Remove crawler from feed parsers

This commit is contained in:
Grégory Soutadé 2016-08-20 13:08:02 +02:00
parent 10d087ad70
commit e805e59c10
3 changed files with 19 additions and 3 deletions

View File

@@ -76,7 +76,9 @@ class IWLADisplayFeeds(IPlugin):
table = display.createBlock(DisplayHTMLBlockTable, self.iwla._(u'All feeds parsers'), [self.iwla._(u'Host'), self.iwla._(u'Pages'), self.iwla._(u'Hits')])
table.setColsCSSClass(['', 'iwla_page', 'iwla_hit'])
for super_hit in hits.values():
if not super_hit['feed_parser']: continue
if not super_hit.get('feed_parser', False): continue
if super_hit['feed_parser'] == IWLAPostAnalysisFeeds.BAD_FEED_PARSER:
continue
nb_feeds_parsers += 1
address = super_hit['remote_addr']
if display_visitor_ip and\

View File

@@ -55,6 +55,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
NOT_A_FEED_PARSER = 0
FEED_PARSER = 1
MERGED_FEED_PARSER = 2
BAD_FEED_PARSER = 3
def __init__(self, iwla):
super(IWLAPostAnalysisFeeds, self).__init__(iwla)
@@ -71,6 +72,9 @@ class IWLAPostAnalysisFeeds(IPlugin):
for f in feeds:
self.feeds_re.append(re.compile(r'.*%s.*' % (f)))
self.bad_feeds_re = []
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
@@ -94,7 +98,17 @@ class IWLAPostAnalysisFeeds(IPlugin):
self.merge_one_hit_only_feeds_parsers:
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
if not isFeedParser is None: continue
if isFeedParser:
if hit['feed_parser'] == self.BAD_FEED_PARSER: continue
if not hit.get('feed_name_analysed', False) and\
hit.get('dns_name_replaced', False):
hit['feed_name_analysed'] = True
addr = hit.get('remote_addr', None)
for r in self.bad_feeds_re:
if r.match(addr):
hit['feed_parser'] = self.BAD_FEED_PARSER
return
return
isFeedParser = self.NOT_A_FEED_PARSER
uri = hit['requests'][0]['extract_request']['extract_uri'].lower()

View File

@@ -67,7 +67,7 @@ class IWLAPostAnalysisReverseDNS(IPlugin):
hits = self.iwla.getCurrentVisits()
for (k, hit) in hits.items():
if hit.get('dns_analysed', False): continue
if not hit['feed_parser'] and\
if not hit.get('feed_parser', False) and\
not self.iwla.isValidVisitor(hit):
continue
try: