iwla

iwla Git Source Tree

Root/plugins/post_analysis/feeds.py

1# -*- coding: utf-8 -*-
2#
3# Copyright Grégory Soutadé 2015
4
5# This file is part of iwla
6
7# iwla is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# iwla is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with iwla. If not, see <http://www.gnu.org/licenses/>.
19#
20
21import re
22
23from iwla import IWLA
24from iplugin import IPlugin
25
26"""
27Post analysis hook
28
29Find feeds parsers (first hit in feeds conf value and no viewed pages if it's a robot)
30If there is only one hit per day to a feed, merge feeds parsers with the same user agent
31as it must be the same person with a different IP address.
32
33Plugin requirements :
34 None
35
36Conf values needed :
37 feeds
38 merge_one_hit_only_feeds_parsers*
39
40Output files :
41 None
42
43Statistics creation :
44 remote_addr =>
45 feed_parser
46
47Statistics update :
48 None
49
50Statistics deletion :
51 None
52"""
53
class IWLAPostAnalysisFeeds(IPlugin):
    """Post analysis plugin that flags visits made by feed parsers.

    Every visit returned by ``iwla.getCurrentVisits()`` receives a
    ``feed_parser`` entry holding one of the class constants below.
    A visit is a feed parser candidate when the URI of its first request
    matches one of the ``feeds`` conf patterns, or when its user agent
    contains ``rss``, ``atom`` or ``feed``.
    """

    # Values stored in hit['feed_parser']
    NOT_A_FEED_PARSER = 0
    FEED_PARSER = 1
    # First one-hit-only visit seen for a given user agent
    MERGED_FEED_PARSER = 2
    # Feed parser whose remote address matches a bad_feeds_re pattern
    BAD_FEED_PARSER = 3

    def __init__(self, iwla):
        super(IWLAPostAnalysisFeeds, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['feeds']

    def load(self):
        """Read the configuration and compile the regular expressions.

        Returns False when the 'feeds' conf value is missing, True
        otherwise.
        """
        feeds = self.iwla.getConfValue('feeds', None)
        self.merge_one_hit_only_feeds_parsers = self.iwla.getConfValue(
            'merge_one_hit_only_feeds_parsers', True)

        if feeds is None:
            return False

        # URIs (lower-cased) that identify a feed
        self.feeds_re = [re.compile(r'.*%s.*' % (f)) for f in feeds]

        # Remote addresses that disqualify a feed parser
        self.bad_feeds_re = [re.compile(r'.*crawl.*')]

        # User agents (lower-cased) that identify a feed parser
        self.user_agents_re = [
            re.compile(r'.*rss.*'),
            re.compile(r'.*atom.*'),
            re.compile(r'.*feed.*'),
        ]

        return True

    def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
        """Store the final 'feed_parser' value of a visit, merging one-hit visits.

        When ``isFeedParser`` is truthy and the visit counts exactly one
        hit (viewed + not viewed), the lower-cased user agent of its first
        request is looked up in ``one_hit_only``:
          - unknown user agent: the visit is recorded in ``one_hit_only``
            and flagged MERGED_FEED_PARSER
          - known user agent: the visit is flagged NOT_A_FEED_PARSER
        In every other case ``isFeedParser`` is stored unchanged.

        ``one_hit_only`` is a dict (user agent => visit) updated in place.
        """
        total_hits = hit['viewed_hits'][0] + hit['not_viewed_hits'][0]
        if isFeedParser and total_hits == 1:
            user_agent = hit['requests'][0]['http_user_agent'].lower()
            if user_agent not in one_hit_only:
                # Merged
                isFeedParser = self.MERGED_FEED_PARSER
                one_hit_only[user_agent] = hit
            else:
                isFeedParser = self.NOT_A_FEED_PARSER
        hit['feed_parser'] = isFeedParser

    def _flagBadFeedParser(self, hit):
        """Flag an already classified visit as BAD_FEED_PARSER if its address is bad.

        The address is checked only once per visit ('feed_name_analysed')
        and only when 'dns_name_replaced' is set on the visit.
        NOTE(review): 'dns_name_replaced' is presumably set by a reverse
        DNS plugin -- confirm.
        """
        if hit['feed_parser'] == self.BAD_FEED_PARSER:
            return
        if hit.get('feed_name_analysed', False) or \
           not hit.get('dns_name_replaced', False):
            return

        hit['feed_name_analysed'] = True
        # A missing address cannot match: avoid re.match(None)
        addr = hit.get('remote_addr', None) or ''
        if any(r.match(addr) for r in self.bad_feeds_re):
            hit['feed_parser'] = self.BAD_FEED_PARSER

    def _detectFeedParser(self, hit):
        """Return FEED_PARSER or NOT_A_FEED_PARSER for a not yet classified visit."""
        isFeedParser = self.NOT_A_FEED_PARSER
        first_request = hit['requests'][0]

        uri = first_request['extract_request']['extract_uri'].lower()
        for regexp in self.feeds_re:
            if regexp.match(uri):
                isFeedParser = self.FEED_PARSER
                # Robot that views pages -> bot
                if hit['robot'] and hit['not_viewed_pages'][0]:
                    isFeedParser = self.NOT_A_FEED_PARSER
                break

        # Fall back on the user agent (also applies to a robot rejected above)
        if isFeedParser == self.NOT_A_FEED_PARSER:
            user_agent = first_request['http_user_agent'].lower()
            if any(regexp.match(user_agent) for regexp in self.user_agents_re):
                isFeedParser = self.FEED_PARSER

        return isFeedParser

    def hook(self):
        """Classify every current visit and set its 'feed_parser' entry."""
        hits = self.iwla.getCurrentVisits()
        # user agent => first one-hit-only visit seen during this run
        one_hit_only = {}
        for hit in hits.values():
            isFeedParser = hit.get('feed_parser', None)

            if isFeedParser == self.FEED_PARSER and \
               self.merge_one_hit_only_feeds_parsers:
                self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)

            if isFeedParser:
                # Visit classified by a previous run: only its address is
                # left to check. Go on with the next visit instead of
                # leaving the hook, or the remaining visits would never
                # be analysed.
                self._flagBadFeedParser(hit)
                continue

            isFeedParser = self._detectFeedParser(hit)

            if self.merge_one_hit_only_feeds_parsers:
                self.mergeOneHitOnlyFeedsParsers(isFeedParser, one_hit_only, hit)
            else:
                hit['feed_parser'] = isFeedParser

Archive Download this file

Branches

Tags