Rework feed merge algorithm. Allow merging feeds based on a name regular expression via the merge_feeds_parsers_list conf value

Gregory Soutade 2022-11-16 21:09:50 +01:00
parent 242bb6cabe
commit f8b37a625c
1 changed file with 38 additions and 20 deletions
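The new merge_feeds_parsers_list value holds regular expressions matched against a visitor's remote address/name; all feed-parser hits matching the same expression are merged into one entry. A minimal configuration sketch, assuming the usual Python conf file and with made-up patterns (these are illustrative, not defaults):

    # conf.py (sketch)
    merge_feeds_parsers = True
    # Feed parsers whose remote address/name matches one of these
    # regular expressions are merged into a single entry per expression.
    merge_feeds_parsers_list = [
        r'.*\.amazonaws\.com',
        r'.*\.googleusercontent\.com',
    ]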


@@ -44,6 +44,7 @@ Output files :
     Statistics creation :
         remote_addr =>
             feed_parser
+            feed_name_analysed
 
     Statistics update :
         None
@@ -67,6 +68,7 @@ class IWLAPostAnalysisFeeds(IPlugin):
         feeds = self.iwla.getConfValue('feeds', [])
         feeds_referers = self.iwla.getConfValue('feeds_referers', [])
         self.merge_feeds_parsers = self.iwla.getConfValue('merge_feeds_parsers', False)
+        _merge_feeds_parsers_list = self.iwla.getConfValue('merge_feeds_parsers_list', [])
 
         if feeds is None: return False
@@ -86,35 +88,51 @@ class IWLAPostAnalysisFeeds(IPlugin):
         for f in feeds_referers:
             self.referers_uri.append(f)
 
+        self.merge_feeds_parsers_list = []
+        for f in _merge_feeds_parsers_list:
+            self.merge_feeds_parsers_list.append(re.compile(f))
+
+        self.merged_feeds = {}
+
         return True
 
-    def mergeFeedsParsers(self, isFeedParser, one_hit_only, hit):
-        # One hit only match
-        if isFeedParser: #isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
-            user_agent = hit['requests'][0]['http_user_agent'].lower()
-            # First time, register into dict
-            if one_hit_only.get(user_agent, None) is None:
-                # Merged
-                one_hit_only[user_agent] = hit
-            else:
-                # Next time
-                # Current must be ignored
-                hit['feed_parser'] = self.NOT_A_FEED_PARSER
-                # Previous matched hit must be set as merged
-                isFeedParser = self.MERGED_FEED_PARSER
-                hit = one_hit_only[user_agent]
+    def _appendToMergeCache(self, isFeedParser, key, hit):
+        # First time, register into dict
+        if self.merged_feeds.get(key, None) is None:
+            # Merged
+            self.merged_feeds[key] = hit
+        else:
+            # Next time
+            # Current must be ignored
+            hit['feed_parser'] = self.NOT_A_FEED_PARSER
+            # Previous matched hit must be set as merged
+            isFeedParser = self.MERGED_FEED_PARSER
+            hit = self.merged_feeds[key]
         hit['feed_parser'] = isFeedParser
 
+    def mergeFeedsParsers(self, isFeedParser, hit):
+        if isFeedParser:
+            # One hit only match
+            if True or (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
+                for r in self.merge_feeds_parsers_list:
+                    if r.match(hit['remote_addr']) or r.match(hit['remote_ip']):
+                        #print('hit match %s' % (hit['remote_addr']))
+                        self._appendToMergeCache(isFeedParser, r, hit)
+                        return
+            #print("No match for %s : %d" % (hit['remote_addr'], hit['viewed_hits'][0] + hit['not_viewed_hits'][0]))
+            # Other cases, look for user agent
+            user_agent = hit['requests'][0]['http_user_agent'].lower()
+            self._appendToMergeCache(isFeedParser, user_agent, hit)
+
     def hook(self):
         hits = self.iwla.getCurrentVisits()
-        one_hit_only = {}
 
         for hit in hits.values():
             isFeedParser = hit.get('feed_parser', None)
 
-            # Register already tagged feed parser in one_hit_only
+            # Register already tagged feed parser in merged_feeds
             if self.merge_feeds_parsers and\
                not isFeedParser in (None, self.BAD_FEED_PARSER):
-                self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
+                self.mergeFeedsParsers(isFeedParser, hit)
                 continue
 
             if isFeedParser:
@@ -157,6 +175,6 @@ class IWLAPostAnalysisFeeds(IPlugin):
                     break
 
         if self.merge_feeds_parsers:
-            self.mergeFeedsParsers(isFeedParser, one_hit_only, hit)
+            self.mergeFeedsParsers(isFeedParser, hit)
         else:
             hit['feed_parser'] = isFeedParser
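For reference, the merge cache introduced here behaves roughly as follows: the first hit registered under a key (a compiled regexp from the conf list, or the lowercased user agent as a fallback) is kept as the canonical entry, and every later hit for the same key is dropped while the canonical one is re-tagged as merged. A standalone sketch of that behaviour, not the plugin itself; the constants, key choice and hit layout are simplified assumptions:

    import re

    # Simplified stand-ins for the plugin's feed_parser states (assumed values).
    NOT_A_FEED_PARSER = False
    MERGED_FEED_PARSER = 'merged'

    merge_feeds_parsers_list = [re.compile(r'.*\.bot\.example\.net')]
    merged_feeds = {}

    def append_to_merge_cache(key, hit):
        # First hit for a key becomes the canonical entry; later hits are
        # ignored and the canonical one is re-tagged as merged.
        first = merged_feeds.get(key)
        if first is None:
            merged_feeds[key] = hit
        else:
            hit['feed_parser'] = NOT_A_FEED_PARSER
            first['feed_parser'] = MERGED_FEED_PARSER

    def merge_feeds(hit):
        # Prefer a remote address/name match against the configured regexps,
        # otherwise fall back to the lowercased user agent string.
        for r in merge_feeds_parsers_list:
            if r.match(hit['remote_addr']):
                append_to_merge_cache(r, hit)
                return
        append_to_merge_cache(hit['user_agent'].lower(), hit)

    hits = [
        {'remote_addr': 'a.bot.example.net', 'user_agent': 'FeedFetcher/1.0', 'feed_parser': True},
        {'remote_addr': 'b.bot.example.net', 'user_agent': 'FeedFetcher/1.0', 'feed_parser': True},
    ]
    for h in hits:
        merge_feeds(h)
    print([h['feed_parser'] for h in hits])  # ['merged', False]

In the commit itself the keys are the compiled regexp objects (or the user agent string), so each configured expression collapses to at most one feed-parser entry per run.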