iwla

iwla Commit Details

Date: 2015-05-13 18:13:18 (6 years 2 months ago)
Author: Grégory Soutadé
Branch: dev, master
Commit: 62be78845a7d4e1defbfa3c6ce54b91e4743cf5a
Parents: 157868dc3eee5ec842fbc65b5794c1e50d31f349
Message: Add debug traces in robots plugin

Changes:
M plugins/pre_analysis/robots.py (6 diffs)

File differences

plugins/pre_analysis/robots.py
1919
2020
2121
22
2223
2324
2425
......
6162
6263
6364
65
6466
6567
68
69
70
71
6672
6773
6874
6975
70
76
77
78
7179
7280
7381
......
7684
7785
7886
79
87
8088
8189
8290
......
8593
8694
8795
88
96
8997
9098
9199
......
95103
96104
97105
98
106
99107
100108
101109
102110
103111
104
112
105113
106114
107115
......
109117
110118
111119
112
120
113121
114122
115123
116124
117
125
118126
#
import re
import logging
from iwla import IWLA
from iplugin import IPlugin
self.awstats_robots = map(lambda (x) : re.compile(('.*%s.*') % (x), re.IGNORECASE), awstats_data.robots)
self.robot_re = re.compile(r'.*bot.*', re.IGNORECASE)
self.crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)
self.logger = logging.getLogger(self.__class__.__name__)
return True
def _setRobot(self, k, super_hit):
    """Mark the visit *super_hit* (keyed by *k*) as a robot, with a trace.

    Helper introduced by this commit so the debug trace and the flag
    update always happen together.
    """
    # NOTE(review): assumes self.logger was created during plugin setup
    # (the logging.getLogger(...) call earlier in this diff) — confirm.
    self.logger.debug('%s is a robot' % (k))
    super_hit['robot'] = 1
# Basic rule to detect robots
# NOTE(review): this excerpt is a unified-diff rendering with the +/-
# column stripped, so PRE-change lines (each bare
# ``super_hit['robot'] = 1`` immediately before a ``self._setRobot``
# call, and the one-line ``if super_hit['robot']: continue`` guard)
# appear alongside their POST-change replacements. Do not read this as
# a single coherent function body.
def hook(self):
    """Pre-analysis hook: flag visits that look like robots."""
    # "Visists" is the upstream API's own spelling — not fixed here.
    hits = self.iwla.getCurrentVisists()
    for (k, super_hit) in hits.items():
        # Pre-change guard (superseded by the traced version below).
        if super_hit['robot']: continue
        # Post-change guard: same skip, now with a debug trace.
        if super_hit['robot']:
            self.logger.debug('%s is a robot' % (k))
            continue
        isRobot = False
        referers = 0
        # NOTE(review): ``first_page`` is defined in context lines
        # omitted from this diff (presumably the visit's first
        # request) — confirm against the full file.
        if self.robot_re.match(first_page['http_user_agent']) or\
           self.crawl_re.match(first_page['http_user_agent']):
            super_hit['robot'] = 1
            self._setRobot(k, super_hit)
            continue
        # NOTE(review): the body matching each awstats regex is an
        # omitted context line; only the trailing break survives here.
        for r in self.awstats_robots:
            break
        if isRobot:
            super_hit['robot'] = 1
            self._setRobot(k, super_hit)
            continue
        # 1) no pages view --> robot
        # 2) pages without hit --> robot
        if not super_hit['viewed_hits']:
            super_hit['robot'] = 1
            self._setRobot(k, super_hit)
            continue
        for hit in super_hit['requests']:
            # 3) /robots.txt read
            if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                isRobot = True
                self._setRobot(k, super_hit)
                break
            # 4) Any referer for hits
            # NOTE(review): the condition incrementing this counter is
            # an omitted context line; placement inside the loop is
            # inferred — confirm against the full file.
            referers += 1
        if isRobot:
            super_hit['robot'] = 1
            self._setRobot(k, super_hit)
            continue
        if not super_hit['viewed_pages'] and \
           (super_hit['viewed_hits'] and not referers):
            super_hit['robot'] = 1
            self._setRobot(k, super_hit)
            continue

Archive: Download the corresponding diff file

Branches

Tags