Fix feeds regular expressions and add 'feeds_referers' configuration

This commit is contained in:
Gregory Soutade 2021-06-03 08:50:46 +02:00
parent 2545ca5e52
commit 4cd7712201
1 changed files with 19 additions and 4 deletions

View File

@ -35,6 +35,7 @@ Plugin requirements :
Conf values needed :
feeds
feeds_referers*
merge_one_hit_only_feeds_parsers*
Output files :
@ -63,14 +64,15 @@ class IWLAPostAnalysisFeeds(IPlugin):
self.conf_requires = ['feeds']
def load(self):
feeds = self.iwla.getConfValue('feeds', None)
feeds = self.iwla.getConfValue('feeds', [])
feeds_referers = self.iwla.getConfValue('feeds_referers', [])
self.merge_one_hit_only_feeds_parsers = self.iwla.getConfValue('merge_one_hit_only_feeds_parsers', True)
if feeds is None: return False
self.feeds_re = []
for f in feeds:
self.feeds_re.append(re.compile(r'.*%s.*' % (f)))
self.feeds_re.append(re.compile(f))
self.bad_feeds_re = []
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
@ -80,6 +82,10 @@ class IWLAPostAnalysisFeeds(IPlugin):
self.user_agents_re.append(re.compile(r'.*atom.*'))
self.user_agents_re.append(re.compile(r'.*feed.*'))
self.referers_uri = []
for f in feeds_referers:
self.referers_uri.append(f)
return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
@ -115,8 +121,9 @@ class IWLAPostAnalysisFeeds(IPlugin):
return
return
request = hit['requests'][0]
isFeedParser = self.NOT_A_FEED_PARSER
uri = hit['requests'][0]['extract_request']['extract_uri'].lower()
uri = request['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re:
if regexp.match(uri):
isFeedParser = self.FEED_PARSER
@ -127,12 +134,20 @@ class IWLAPostAnalysisFeeds(IPlugin):
break
if isFeedParser == self.NOT_A_FEED_PARSER:
user_agent = hit['requests'][0]['http_user_agent'].lower()
user_agent = request['http_user_agent'].lower()
for regexp in self.user_agents_re:
if regexp.match(user_agent):
isFeedParser = self.FEED_PARSER
break
if isFeedParser == self.NOT_A_FEED_PARSER and\
request.get('extract_referer', False):
referer = request['extract_referer']['extract_uri'].lower()
for uri in self.referers_uri:
if referer == uri:
isFeedParser = self.FEED_PARSER
break
if self.merge_one_hit_only_feeds_parsers:
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
else: