iwla

iwla Git Source Tree

Root/plugins/post_analysis/referers.py

1import re
2import urllib
3
4from iwla import IWLA
5from iplugin import IPlugin
6
7import awstats_data
8
9#
10# Post analysis hook
11#
12# Extract referers and key phrases from requests
13#
14# Plugin requirements :
15# None
16#
17# Conf values needed :
18# domain_name
19#
20# Output files :
21# None
22#
23# Statistics creation :
24# None
25#
26# Statistics update :
27# month_stats :
28# referers =>
29# pages
30# hits
31# robots_referers =>
32# pages
33# hits
34# search_engine_referers =>
35# pages
36# hits
37# key_phrases =>
38# phrase
39#
40# Statistics deletion :
41# None
42#
43
class IWLAPostAnalysisReferers(IPlugin):
    """Post analysis hook that extracts referers and key phrases from requests.

    Every request with a referer outside our own domain is counted in one of
    three month_stats dictionaries ('referers', 'robots_referers' or
    'search_engine_referers'), each mapping a referer to its
    {'pages': n, 'hits': n} counters.  Key phrases found in search engine
    referers are counted in month_stats['key_phrases'].
    """

    def __init__(self, iwla):
        super(IWLAPostAnalysisReferers, self).__init__(iwla)
        self.API_VERSION = 1
        self.conf_requires = ['domain_name']

    def _getSearchEngine(self, hashid):
        """Return the name of the search engine registered with `hashid`.

        `hashid` is compared for equality with the raw hashid strings stored
        by load().  Returns None when no engine owns it.
        """
        for (name, engine) in self.search_engines.items():
            for (known_hashid, _) in engine['hashid']:
                if hashid == known_hashid:
                    return name
        return None

    def load(self):
        """Compile the own-domain and search engine patterns.

        Returns False (after printing a message) when the 'domain_name'
        configuration value is empty, True otherwise.
        """
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print('domain_name must not be empty !')
            return False

        # domain_name is interpolated as is, so it is interpreted as a
        # regular expression fragment
        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))

        # name => {
        #   'hashid'            : [(hashid, compiled hashid), ...],
        #   'known_url'         : compiled key phrase pattern (optional),
        #   'not_search_engine' : compiled exclusion pattern (optional)
        # }
        self.search_engines = {}

        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            engine = self.search_engines.setdefault(name, {'hashid': []})
            engine['hashid'].append((hashid, hashid_re))

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            engine = self.search_engines.get(name)
            # An engine without any hashid can never be matched in hook(),
            # so skip its key phrase pattern instead of raising KeyError
            if engine is None:
                continue
            engine['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        """Count the key phrase carried by a search engine referer.

        key_phrase_re : compiled pattern exposing a 'key_phrase' group
                        (may be None)
        parameters    : query string of the referer (may be None or empty)
        key_phrases   : phrase => count dictionary, updated in place

        Only the first parameter matching key_phrase_re is taken into account.
        """
        if not parameters or not key_phrase_re:
            return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if not groups:
                continue
            key_phrase = urllib.unquote_plus(groups.group('key_phrase'))
            # Referers come from the outside and are not always valid UTF-8 :
            # replace undecodable bytes rather than aborting the whole
            # analysis with a UnicodeDecodeError
            key_phrase = key_phrase.decode('utf8', 'replace')
            key_phrases[key_phrase] = key_phrases.get(key_phrase, 0) + 1
            break

    def _identifySearchEngine(self, uri):
        """Return (name, engine) of the first search engine recognizing `uri`.

        An engine recognizes `uri` when one of its hashid patterns matches
        and its 'not_search_engine' pattern (if any) does not.
        Returns (None, None) when no engine recognizes `uri`.
        """
        for (name, engine) in self.search_engines.items():
            not_engine = engine.get('not_search_engine', None)
            for (_, hashid_re) in engine['hashid']:
                if not hashid_re.match(uri):
                    continue
                # Excluded for this engine : give up on it, try the next ones
                if not_engine and not_engine.match(uri):
                    break
                return (name, engine)
        return (None, None)

    def hook(self):
        """Update referers and key phrases statistics of the current month."""
        stats = self.iwla.getCurrentVisists()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})
        key_phrases = month_stats.get('key_phrases', {})

        for super_hit in stats.values():
            for r in super_hit['requests']:
                if not self.iwla.isValidForCurrentAnalysis(r):
                    continue
                if not r['http_referer']:
                    continue

                uri = r['extract_referer']['extract_uri']

                # Internal navigation is not a referer
                if self.own_domain_re.match(uri):
                    continue

                # Stop at the first engine found : the referer is then
                # counted under the engine name, which must not be tested
                # against the remaining engines (it could be relabelled and
                # its key phrase counted twice)
                (name, engine) = self._identifySearchEngine(uri)

                if engine is not None:
                    uri = name
                    parameters = r['extract_referer'].get('extract_parameters', None)
                    key_phrase_re = engine.get('known_url', None)
                    self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                else:
                    dictionary = referers

                key = 'pages' if r['is_page'] else 'hits'
                counters = dictionary.setdefault(uri, {'pages': 0, 'hits': 0})
                counters[key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases

Archive Download this file

Branches

Tags