1 | # -*- coding: utf-8 -*-␊ |
2 | #␊ |
3 | # Copyright Grégory Soutadé 2015␊ |
4 | ␊ |
5 | # This file is part of iwla␊ |
6 | ␊ |
7 | # iwla is free software: you can redistribute it and/or modify␊ |
8 | # it under the terms of the GNU General Public License as published by␊ |
9 | # the Free Software Foundation, either version 3 of the License, or␊ |
10 | # (at your option) any later version.␊ |
11 | #␊ |
12 | # iwla is distributed in the hope that it will be useful,␊ |
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of␊ |
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the␊ |
15 | # GNU General Public License for more details.␊ |
16 | #␊ |
17 | # You should have received a copy of the GNU General Public License␊ |
18 | # along with iwla. If not, see <http://www.gnu.org/licenses/>.␊ |
19 | #␊ |
20 | ␊ |
21 | import re␊ |
22 | import logging␊ |
23 | import inspect␊ |
24 | ␊ |
25 | from iwla import IWLA␊ |
26 | from iplugin import IPlugin␊ |
27 | ␊ |
28 | import awstats_data␊ |
29 | ␊ |
30 | """␊ |
31 | Pre analysis hook␊ |
32 | ␊ |
33 | Filter robots␊ |
34 | ␊ |
35 | Plugin requirements :␊ |
36 | None␊ |
37 | ␊ |
Conf values needed :
    None

41 | ␊ |
42 | Output files :␊ |
43 | None␊ |
44 | ␊ |
45 | Statistics creation :␊ |
46 | None␊ |
47 | ␊ |
48 | Statistics update :␊ |
49 | visits :␊ |
50 | remote_addr =>␊ |
51 | robot␊ |
52 | ␊ |
53 | Statistics deletion :␊ |
54 | None␊ |
55 | """␊ |
56 | ␊ |
class IWLAPreAnalysisRobots(IPlugin):
    """Pre-analysis plugin that flags visits made by robots.

    For each current visit (keyed by remote address) not already marked
    as a robot, the first request's user agent is matched against
    generic bot/crawl patterns and the awstats robot list, then the
    visit's request pattern is checked against heuristic rules (see the
    numbered comments in hook()). A positive match sets
    super_hit['robot'] = 1.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisRobots, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        # Compile every awstats user-agent fragment once, case insensitive,
        # so hook() only pays the match cost per visit.
        self.awstats_robots = list(map(lambda x : re.compile(('.*%s.*') % (x), re.IGNORECASE), awstats_data.robots))
        self.robot_re = re.compile(r'.*bot.*', re.IGNORECASE)
        self.crawl_re = re.compile(r'.*crawl.*', re.IGNORECASE)
        self.logger = logging.getLogger(self.__class__.__name__)
        return True

    def _setRobot(self, k, super_hit):
        """Mark the visit as a robot and log which rule (caller line) fired."""
        # Inspect the caller's frame so the debug log points at the exact
        # detection rule without each call site having to say so.
        callerframerecord = inspect.stack()[1]
        frame = callerframerecord[0]
        info = inspect.getframeinfo(frame)

        self.logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))
        super_hit['robot'] = 1

    # Basic rules to detect robots
    def hook(self):
        hits = self.iwla.getCurrentVisits()
        for (k, super_hit) in hits.items():
            # Already flagged (e.g. by a previous run): nothing to do.
            if super_hit['robot']:
                self.logger.debug('%s is a robot' % (k))
                continue

            isRobot = False
            referers = 0

            first_page = super_hit['requests'][0]
            # Hoist the repeated dict lookup — the user agent of the first
            # request is what all the signature checks below examine.
            user_agent = first_page['http_user_agent']

            # 1) User agent contains "bot" or "crawl"
            if self.robot_re.match(user_agent) or\
                self.crawl_re.match(user_agent):
                self.logger.debug(user_agent)
                self._setRobot(k, super_hit)
                continue

            # 2) User agent matches a known awstats robot signature
            for r in self.awstats_robots:
                if r.match(user_agent):
                    isRobot = True
                    break

            if isRobot:
                self.logger.debug(user_agent)
                self._setRobot(k, super_hit)
                continue

            # 3) Less than 1 hit per page
            if super_hit['viewed_pages'][0] and (super_hit['viewed_hits'][0] < super_hit['viewed_pages'][0]):
                self._setRobot(k, super_hit)
                continue

            # 4) No pages and no hits --> robot
            if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
                self._setRobot(k, super_hit)
                continue

            not_found_pages = 0
            for hit in super_hit['requests']:
                # 5) /robots.txt read
                if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                    # Fix: record the detection via isRobot and break; the
                    # original called _setRobot() here but left isRobot
                    # False, making the post-loop check dead code and
                    # letting rules 6/7 run (and re-set the flag) after a
                    # positive match.
                    isRobot = True
                    break

                # Count error answers for rule 6; single int() conversion.
                if int(hit['status']) in (404, 403):
                    not_found_pages += 1

                # Any referer on a plain hit counts for rule 7.
                if not hit['is_page'] and hit['http_referer']:
                    referers += 1

            if isRobot:
                self._setRobot(k, super_hit)
                continue

            # 6) More than 10 404/403 pages
            if not_found_pages > 10:
                self._setRobot(k, super_hit)
                continue

            # 7) Hits but no pages viewed and never any referer --> robot
            if not super_hit['viewed_pages'][0] and \
                    (super_hit['viewed_hits'][0] and not referers):
                self._setRobot(k, super_hit)
                continue