iwla

iwla Git Source Tree

Root/plugins/post_analysis/referers.py

1# -*- coding: utf-8 -*-
2#
3# Copyright Grégory Soutadé 2015
4
5# This file is part of iwla
6
7# iwla is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# iwla is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with iwla. If not, see <http://www.gnu.org/licenses/>.
19#
20
21import re
22import urllib
23
24from iwla import IWLA
25from iplugin import IPlugin
26
27import awstats_data
28
29"""
30Post analysis hook
31
32Extract referers and key phrases from requests
33
34Plugin requirements :
35 None
36
37Conf values needed :
38 domain_name
39
40Output files :
41 None
42
43Statistics creation :
44 None
45
46Statistics update :
47month_stats :
48 referers =>
49 pages => count
50 hits => count
51 robots_referers =>
52 pages => count
53 hits => count
54 search_engine_referers =>
55 pages => count
56 hits => count
57 key_phrases =>
58 phrase => count
59
60Statistics deletion :
61 None
62"""
63
class IWLAPostAnalysisReferers(IPlugin):
    """Post analysis plugin that classifies the referers of requests.

    Every request of the current visits whose referer does not match the
    configured domain name is counted, as a page or as a hit, in one of
    three month_stats dictionaries: 'robots_referers',
    'search_engine_referers' or 'referers'.  Key phrases extracted from
    search engine referers are counted in month_stats['key_phrases'].
    """

    def __init__(self, iwla):
        super(IWLAPostAnalysisReferers, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['domain_name']

    def _getSearchEngine(self, hashid):
        """Return the name of the engine registered under `hashid`.

        `hashid` is compared for equality with the raw hashid strings
        stored by load().  Returns None when no engine uses it.
        """
        for (name, engine) in self.search_engines.items():
            for (known_hashid, _) in engine['hashid']:
                if hashid == known_hashid:
                    return name
        return None

    def load(self):
        """Compile the regular expressions used by hook().

        Builds self.own_domain_re from the 'domain_name' conf value and
        self.search_engines from the awstats_data tables.

        Returns False (after printing a message) when 'domain_name' is
        empty, True otherwise.
        """
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print('domain_name must not be empty !')
            return False

        # NOTE: domain_name is inserted unescaped, so any regex
        # metacharacter it contains (e.g. '.') keeps its regex meaning.
        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))

        # engine name => {'hashid': [(hashid, compiled hashid), ...],
        #                 'known_url': compiled re (optional),
        #                 'not_search_engine': compiled re (optional)}
        self.search_engines = {}

        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            engine = self.search_engines.setdefault(name, {'hashid': []})
            engine['hashid'].append((hashid, hashid_re))
            #print 'Hashid %s => %s' % (name, hashid)

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            # The named group captures everything that follows the known
            # URL parameter prefix.
            self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        """Count the first key phrase found in `parameters`.

        `parameters` is split on '&' and each piece is matched against
        `key_phrase_re`, which must define a 'key_phrase' group.  The first
        piece that matches and can be decoded is URL-unquoted, decoded as
        UTF-8 and its counter in the `key_phrases` dictionary is
        incremented; the remaining pieces are ignored.  Pieces that fail to
        decode are reported with print() and skipped.

        Nothing is done when `parameters` or `key_phrase_re` is empty/None.
        """
        if not parameters or not key_phrase_re: return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if not groups:
                continue
            key_phrase = groups.groupdict()['key_phrase']
            try:
                # urllib.unquote_plus() is the Python 2 location of this
                # function; it returns a byte string that is decoded here.
                key_phrase = urllib.unquote_plus(key_phrase).decode('utf8')
            except Exception as e:
                print(e)
                continue
            # dict.get() avoids building the key list on every lookup.
            key_phrases[key_phrase] = key_phrases.get(key_phrase, 0) + 1
            break

    def _findSearchEngine(self, uri):
        """Return (name, engine) of the first search engine matching `uri`.

        An engine matches when one of its hashid expressions matches `uri`
        and its optional 'not_search_engine' expression does not.  Returns
        (None, None) when no engine matches.
        """
        for (name, engine) in self.search_engines.items():
            not_engine = engine.get('not_search_engine', None)
            for (hashid, hashid_re) in engine['hashid']:
                if not hashid_re.match(uri): continue
                # Excluded URI for this engine: give up on this engine and
                # try the next one.
                if not_engine and not_engine.match(uri): break
                # Stop at the first match so that no other engine is
                # tested (and no key phrase counted twice) afterwards.
                return (name, engine)
        return (None, None)

    def hook(self):
        """Update the referers statistics of the current month.

        Requests of each visit are walked from the last one to the first
        one; the walk of a visit stops at the first request rejected by
        isValidForCurrentAnalysis().  Requests without referer, or whose
        referer matches the own domain expression, are ignored.
        """
        stats = self.iwla.getCurrentVisits()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})
        key_phrases = month_stats.get('key_phrases', {})

        for (k, super_hit) in stats.items():
            for r in super_hit['requests'][::-1]:
                if not self.iwla.isValidForCurrentAnalysis(r): break
                if not r['http_referer']: continue

                uri = r['extract_referer']['extract_uri']
                if self.own_domain_re.match(uri): continue

                if super_hit['robot']:
                    dictionary = robots_referers
                    # print '%s => %s' % (uri, super_hit['remote_ip'])
                else:
                    (name, engine) = self._findSearchEngine(uri)
                    if engine is not None:
                        # Search engine referers are grouped under the
                        # engine name instead of the full URI.
                        uri = name
                        dictionary = search_engine_referers

                        parameters = r['extract_referer'].get('extract_parameters', None)
                        key_phrase_re = engine.get('known_url', None)

                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                    else:
                        dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'
                if uri not in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases

Archive Download this file

Branches

Tags