Filter robots using *bot* and *crawl* regular expressions

This commit is contained in:
Gregory Soutade 2015-01-11 18:06:44 +01:00
parent 00ad08a201
commit 4c74a14037
2 changed files with 8 additions and 3 deletions

View File

@@ -190,7 +190,7 @@ class IWLADisplayReferers(IPlugin):
# All key phrases in a file # All key phrases in a file
if self.create_all_key_phrases: if self.create_all_key_phrases:
title = createCurTitle(self.iwla, u'All Key Phrases') title = createCurTitle(self.iwla, self.iwla._(u'All Key Phrases'))
filename = 'key_phrases.html' filename = 'key_phrases.html'
path = self.iwla.getCurDisplayPath(filename) path = self.iwla.getCurDisplayPath(filename)

View File

@@ -59,7 +59,8 @@ class IWLAPreAnalysisRobots(IPlugin):
def load(self): def load(self):
self.awstats_robots = map(lambda (x) : re.compile(('.*%s.*') % (x), re.IGNORECASE), awstats_data.robots) self.awstats_robots = map(lambda (x) : re.compile(('.*%s.*') % (x), re.IGNORECASE), awstats_data.robots)
self.robot_re = re.compile(r'.*bot.*', re.IGNORECASE)
self.crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)
return True return True
# Basic rule to detect robots # Basic rule to detect robots
@@ -72,7 +73,11 @@ class IWLAPreAnalysisRobots(IPlugin):
referers = 0 referers = 0
first_page = super_hit['requests'][0] first_page = super_hit['requests'][0]
if not self.iwla.isValidForCurrentAnalysis(first_page): continue
if self.robot_re.match(first_page['http_user_agent']) or\
self.crawl_re.match(first_page['http_user_agent']):
super_hit['robot'] = 1
continue
for r in self.awstats_robots: for r in self.awstats_robots:
if r.match(first_page['http_user_agent']): if r.match(first_page['http_user_agent']):