iwla Commit Details

Date: 2014-11-26 21:06:36 (6 years 7 months ago)
Author: Grégory Soutadé
Branch: dev, master
Commit: f8a48a71444da17df8ceb8bad24985ca97a82f1f
Parents: fec5e375e44310887e644dffcf440d9a22b98deb
Message: Split referers plugin into post_analysis and display. Remove post_analysis top_visitors (done in display).

Changes:
D  plugins/post_analysis/top_visitors.py (full)
A  plugins/post_analysis/referers.py (full)
M  conf.py (1 diff)
M  plugins/display/referers.py (1 diff)
M  plugins/display/top_visitors.py (1 diff)

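The split follows the pattern visible in the diffs below: a post_analysis plugin computes statistics once and stores them in the month stats, and a display plugin only reads what was stored. A minimal sketch of that pattern, assuming the IWLA plugin API used in this repository (IPlugin, getMonthStats, the requires list); the plugin names and the 'example_counter' key are made up for illustration:

from iplugin import IPlugin

class IWLAPostAnalysisExample(IPlugin):
    def __init__(self, iwla):
        super(IWLAPostAnalysisExample, self).__init__(iwla)
        self.API_VERSION = 1

    def hook(self):
        # Compute during post-analysis and persist into the month stats.
        month_stats = self.iwla.getMonthStats()
        month_stats['example_counter'] = month_stats.get('example_counter', 0) + 1

class IWLADisplayExample(IPlugin):
    def __init__(self, iwla):
        super(IWLADisplayExample, self).__init__(iwla)
        self.API_VERSION = 1
        # Make sure the post_analysis plugin has run before display.
        self.requires = ['IWLAPostAnalysisExample']

    def hook(self):
        # Only read what post_analysis stored; no recomputation at display time.
        counter = self.iwla.getMonthStats().get('example_counter', 0)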
File differences

conf.py
 DISPLAY_ROOT = './output/'
 pre_analysis_hooks = ['page_to_hit', 'robots']
-post_analysis_hooks = ['top_visitors']
+post_analysis_hooks = ['referers']
 # post_analysis_hooks = ['top_visitors', 'reverse_dns']
 display_hooks = ['top_visitors', 'all_visits', 'referers']
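For reference, the relevant hook lists in conf.py after this change (taken from the diff above). Note that 'top_visitors' stays in display_hooks: the top-visitors table is now computed entirely by the display plugin, as the plugins/display/top_visitors.py diff below shows.

pre_analysis_hooks = ['page_to_hit', 'robots']
post_analysis_hooks = ['referers']
display_hooks = ['top_visitors', 'all_visits', 'referers']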
plugins/display/referers.py
 import time
-import re
-import xml.sax.saxutils as saxutils
 
 from iwla import IWLA
 from iplugin import IPlugin
 from display import *
 
-import awstats_data
-
 class IWLADisplayReferers(IPlugin):
     def __init__(self, iwla):
         super(IWLADisplayReferers, self).__init__(iwla)
         self.API_VERSION = 1
-
-    def _getSearchEngine(self, hashid):
-        #print 'Look for %s' % engine
-        for (k, e) in self.search_engines.items():
-            for (h, h_re) in e['hashid']:
-                if hashid == h:
-                    return k
-        #print 'Not found %s' % (hashid)
-        return None
-
-    def load(self):
-        domain_name = self.iwla.getConfValue('domain_name', '')
-
-        if not domain_name:
-            print 'domain_name required in conf'
-            return False
-
-        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
-        self.search_engines = {}
-
-        for (hashid, name) in awstats_data.search_engines_hashid.items():
-            hashid_re = re.compile(r'.*%s.*' % (hashid))
-            if not name in self.search_engines.keys():
-                self.search_engines[name] = {
-                    'hashid' : [(hashid, hashid_re)]
-                }
-            else:
-                self.search_engines[name]['hashid'].append((hashid, hashid_re))
-            #print 'Hashid %s => %s' % (name, hashid)
-
-        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
-            self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')
-
-        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
-            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
-            key = self._getSearchEngine(engine)
-            if key:
-                self.search_engines[key]['not_search_engine'] = not_engine_re
-
-        #self.html_parser = html.parser.HTMLParser()
-
-        return True
-
-    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
-        if not parameters or not key_phrase_re: return
-
-        for p in parameters.split('&'):
-            groups = key_phrase_re.match(p)
-            if groups:
-                key_phrase = groups.groupdict()['key_phrase']
-                key_phrase = key_phrase.replace('+', ' ').lower()
-                key_phrase = saxutils.unescape(key_phrase)
-                if not key_phrase in key_phrases.keys():
-                    key_phrases[key_phrase] = 1
-                else:
-                    key_phrases[key_phrase] += 1
-                break
+        self.requires = ['IWLAPostAnalysisReferers']
 
     def hook(self):
-        stats = self.iwla.getCurrentVisists()
-        referers = {}
-        robots_referers = {}
-        search_engine_referers = {}
-        key_phrases = {}
-
-        for (k, super_hit) in stats.items():
-            for r in super_hit['requests']:
-                if not r['http_referer']: continue
-
-                uri = r['extract_referer']['extract_uri']
-                is_search_engine = False
-
-                if self.own_domain_re.match(uri): continue
-
-                for (name, engine) in self.search_engines.items():
-                    for (hashid, hashid_re) in engine['hashid']:
-                        if not hashid_re.match(uri): continue
-
-                        not_engine = engine.get('not_search_engine', None)
-                        # Try not engine
-                        if not_engine and not_engine.match(uri): break
-
-                        is_search_engine = True
-                        uri = name
-
-                        parameters = r['extract_referer'].get('extract_parameters', None)
-                        key_phrase_re = engine.get('known_url', None)
-
-                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
-
-                        break
-
-                if is_search_engine:
-                    dictionary = search_engine_referers
-                elif super_hit['robot']:
-                    dictionary = robots_referers
-                    # print '%s => %s' % (uri, super_hit['remote_ip'])
-                else:
-                    dictionary = referers
-
-                if r['is_page']:
-                    key = 'pages'
-                else:
-                    key = 'hits'
-
-                if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
-                dictionary[uri][key] += 1
+        month_stats = self.iwla.getMonthStats()
+        referers = month_stats.get('referers', {})
+        robots_referers = month_stats.get('robots_referers', {})
+        search_engine_referers = month_stats.get('search_engine_referers', {})
+        key_phrases = month_stats.get('key_phrases', {})
 
         top_referers = [(k, referers[k]['pages']) for k in referers.keys()]
         top_referers = sorted(top_referers, key=lambda t: t[1], reverse=True)
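With the analysis gone, the display hook only sorts the dictionaries that IWLAPostAnalysisReferers stores in the month stats. A standalone sketch of that sort with made-up data:

# Hypothetical 'referers' dictionary as stored by the post_analysis plugin.
referers = {
    'example.org/blog': {'pages': 12, 'hits': 40},
    'news.example.com': {'pages': 3,  'hits': 9},
}

# Same ordering as in the hook above: referers with the most pages first.
top_referers = [(k, referers[k]['pages']) for k in referers.keys()]
top_referers = sorted(top_referers, key=lambda t: t[1], reverse=True)
# -> [('example.org/blog', 12), ('news.example.com', 3)]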
plugins/display/top_visitors.py
     def __init__(self, iwla):
         super(IWLADisplayTopVisitors, self).__init__(iwla)
         self.API_VERSION = 1
-        self.requires = ['IWLAPostAnalysisTopVisitors']
 
     def hook(self):
-        stats = self.iwla.getMonthStats()
+        hits = self.iwla.getValidVisitors()
+        top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()]
+        top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)
+        top_visitors = [hits[h[0]] for h in top_bandwidth[:10]]
+
         index = self.iwla.getDisplayIndex()
 
         table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen'])
-        for super_hit in stats['top_visitors']:
+        for super_hit in top_visitors:
             address = super_hit['remote_addr']
             if self.iwla.getConfValue('display_visitor_ip', False) and\
                super_hit.get('dns_name_replaced', False):
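The host column of the table depends on the 'display_visitor_ip' option read above; a hypothetical conf.py entry for it:

# Show the original address next to the resolved DNS name
# (hypothetical value; the option name comes from the code above).
display_visitor_ip = True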
plugins/post_analysis/referers.py
import time
import re
import xml.sax.saxutils as saxutils

from iwla import IWLA
from iplugin import IPlugin

import awstats_data

class IWLAPostAnalysisReferers(IPlugin):
    def __init__(self, iwla):
        super(IWLAPostAnalysisReferers, self).__init__(iwla)
        self.API_VERSION = 1

    def _getSearchEngine(self, hashid):
        for (k, e) in self.search_engines.items():
            for (h, h_re) in e['hashid']:
                if hashid == h:
                    return k
        return None

    def load(self):
        domain_name = self.iwla.getConfValue('domain_name', '')

        if not domain_name:
            print 'domain_name required in conf'
            return False

        self.own_domain_re = re.compile(r'.*%s.*' % (domain_name))
        self.search_engines = {}

        for (hashid, name) in awstats_data.search_engines_hashid.items():
            hashid_re = re.compile(r'.*%s.*' % (hashid))
            if not name in self.search_engines.keys():
                self.search_engines[name] = {
                    'hashid' : [(hashid, hashid_re)]
                }
            else:
                self.search_engines[name]['hashid'].append((hashid, hashid_re))
            #print 'Hashid %s => %s' % (name, hashid)

        for (name, known_url) in awstats_data.search_engines_knwown_url.items():
            self.search_engines[name]['known_url'] = re.compile(known_url + '(?P<key_phrase>.+)')

        for (engine, not_engine) in awstats_data.not_search_engines_keys.items():
            not_engine_re = re.compile(r'.*%s.*' % (not_engine))
            key = self._getSearchEngine(engine)
            if key:
                self.search_engines[key]['not_search_engine'] = not_engine_re

        return True

    def _extractKeyPhrase(self, key_phrase_re, parameters, key_phrases):
        if not parameters or not key_phrase_re: return

        for p in parameters.split('&'):
            groups = key_phrase_re.match(p)
            if groups:
                key_phrase = groups.groupdict()['key_phrase']
                key_phrase = key_phrase.replace('+', ' ').lower()
                key_phrase = saxutils.unescape(key_phrase)
                if not key_phrase in key_phrases.keys():
                    key_phrases[key_phrase] = 1
                else:
                    key_phrases[key_phrase] += 1
                break

    def hook(self):
        start_time = self.iwla.getStartAnalysisTime()
        start_time = time.mktime(start_time)
        stats = self.iwla.getCurrentVisists()
        month_stats = self.iwla.getMonthStats()

        referers = month_stats.get('referers', {})
        robots_referers = month_stats.get('robots_referers', {})
        search_engine_referers = month_stats.get('search_engine_referers', {})
        key_phrases = month_stats.get('key_phrases', {})

        for (k, super_hit) in stats.items():
            for r in super_hit['requests']:
                if time.mktime(r['time_decoded']) < start_time: continue
                if not r['http_referer']: continue

                uri = r['extract_referer']['extract_uri']
                is_search_engine = False

                if self.own_domain_re.match(uri): continue

                for (name, engine) in self.search_engines.items():
                    for (hashid, hashid_re) in engine['hashid']:
                        if not hashid_re.match(uri): continue

                        not_engine = engine.get('not_search_engine', None)
                        # Try not engine
                        if not_engine and not_engine.match(uri): break

                        is_search_engine = True
                        uri = name

                        parameters = r['extract_referer'].get('extract_parameters', None)
                        key_phrase_re = engine.get('known_url', None)

                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)

                        break

                if is_search_engine:
                    dictionary = search_engine_referers
                elif super_hit['robot']:
                    dictionary = robots_referers
                    # print '%s => %s' % (uri, super_hit['remote_ip'])
                else:
                    dictionary = referers

                if r['is_page']:
                    key = 'pages'
                else:
                    key = 'hits'

                if not uri in dictionary: dictionary[uri] = {'pages':0, 'hits':0}
                dictionary[uri][key] += 1

        month_stats['referers'] = referers
        month_stats['robots_referers'] = robots_referers
        month_stats['search_engine_referers'] = search_engine_referers
        month_stats['key_phrases'] = key_phrases
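As a rough sketch, the shape of the month stats entries this plugin fills in and that IWLADisplayReferers consumes (the keys come from the code above; the values are invented):

month_stats = {
    'referers':               {'example.org/blog': {'pages': 12, 'hits': 40}},
    'robots_referers':        {'crawler.example.net': {'pages': 0, 'hits': 7}},
    'search_engine_referers': {'Google': {'pages': 5, 'hits': 5}},
    'key_phrases':            {'iwla web log analyzer': 2},
}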
plugins/post_analysis/top_visitors.py
from iwla import IWLA
from iplugin import IPlugin

class IWLAPostAnalysisTopVisitors(IPlugin):
    def __init__(self, iwla):
        super(IWLAPostAnalysisTopVisitors, self).__init__(iwla)
        self.API_VERSION = 1

    def hook(self):
        hits = self.iwla.getValidVisitors()
        stats = self.iwla.getMonthStats()

        top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()]
        top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)

        stats['top_visitors'] = [hits[h[0]] for h in top_bandwidth[:10]]
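This deleted plugin's only job, keeping the ten visitors with the highest bandwidth, now happens in plugins/display/top_visitors.py (see its diff above). A standalone sketch of that selection with invented visitor data:

# Hypothetical valid visitors keyed by remote address.
hits = {
    '192.0.2.1': {'remote_addr': '192.0.2.1', 'bandwidth': 123456},
    '192.0.2.2': {'remote_addr': '192.0.2.2', 'bandwidth': 654321},
}

top_bandwidth = [(k, hits[k]['bandwidth']) for k in hits.keys()]
top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)
top_visitors = [hits[h[0]] for h in top_bandwidth[:10]]
# -> the visitor at 192.0.2.2 first (largest bandwidth), then 192.0.2.1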
