iwla

iwla Commit Details

Date: 2021-06-03 08:50:46 (1 month 26 days ago)
Author: Grégory Soutadé
Branch: dev
Commit: 4cd7712201f35ccdc2cfe896f0d61b66ff6a598d
Parents: 2545ca5e52fba1157b0d67e12b63b18ba4178a00
Message: Fix feeds re and add 'feeds_referers' configuration

Changes:
M plugins/post_analysis/feeds.py (5 diffs)

File differences

plugins/post_analysis/feeds.py
3535
3636
3737
38
3839
3940
4041
......
6364
6465
6566
66
67
68
6769
6870
6971
7072
7173
7274
73
75
7476
7577
7678
......
8082
8183
8284
85
86
87
88
8389
8490
8591
......
115121
116122
117123
124
118125
119
126
120127
121128
122129
......
127134
128135
129136
130
137
131138
132139
133140
134141
135142
143
144
145
146
147
148
149
150
136151
137152
138153
Conf values needed :
feeds
feeds_referers*
merge_one_hit_only_feeds_parsers*
Output files :
self.conf_requires = ['feeds']
def load(self):
feeds = self.iwla.getConfValue('feeds', None)
feeds = self.iwla.getConfValue('feeds', [])
feeds_referers = self.iwla.getConfValue('feeds_referers', [])
self.merge_one_hit_only_feeds_parsers = self.iwla.getConfValue('merge_one_hit_only_feeds_parsers', True)
if feeds is None: return False
self.feeds_re = []
for f in feeds:
self.feeds_re.append(re.compile(r'.*%s.*' % (f)))
self.feeds_re.append(re.compile(f))
self.bad_feeds_re = []
self.bad_feeds_re.append(re.compile(r'.*crawl.*'))
self.user_agents_re.append(re.compile(r'.*atom.*'))
self.user_agents_re.append(re.compile(r'.*feed.*'))
self.referers_uri = []
for f in feeds_referer:
self.referers_uri.append(f)
return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
return
return
request = hit['requests'][0]
isFeedParser = self.NOT_A_FEED_PARSER
uri = hit['requests'][0]['extract_request']['extract_uri'].lower()
uri = request['extract_request']['extract_uri'].lower()
for regexp in self.feeds_re:
if regexp.match(uri):
isFeedParser = self.FEED_PARSER
break
if isFeedParser == self.NOT_A_FEED_PARSER:
user_agent = hit['requests'][0]['http_user_agent'].lower()
user_agent = request['http_user_agent'].lower()
for regexp in self.user_agents_re:
if regexp.match(user_agent):
isFeedParser = self.FEED_PARSER
break
if isFeedParser == self.NOT_A_FEED_PARSER and\
request.get('extract_referer', False):
referer = request['extract_referer']['extract_uri'].lower()
for uri in self.referers_uri:
if referer == uri:
isFeedParser = self.FEED_PARSER
break
if self.merge_one_hit_only_feeds_parsers:
self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
else:

Archive: Download the corresponding diff file

Branches

Tags