iwla

iwla Commit Details

Date: 2014-11-27 09:01:51 (6 years 8 months ago)
Author: Grégory Soutadé
Branch: dev, master
Commit: dd8349ab082bca7acc7e4d722a0df60b46367f8c
Parents: 6b0ed18f35e6db6c8d0adcf02e69f458e0eef0bc
Message: Add option count_hit_only_visitors and function isValidForCurrentAnalysis()

Changes:
M conf.py (1 diff)
M default_conf.py (1 diff)
M iwla.py (6 diffs)
M plugins/display/all_visits.py (2 diffs)
M plugins/display/referers.py (1 diff)
M plugins/display/top_visitors.py (2 diffs)
M plugins/post_analysis/referers.py (3 diffs)
M plugins/post_analysis/top_pages.py (3 diffs)
M plugins/pre_analysis/page_to_hit.py (2 diffs)
M plugins/pre_analysis/robots.py (1 diff)

File differences

conf.py
2222
2323
2424
25
26
27
25
26
reverse_dns_timeout = 0.2
page_to_hit_conf = [r'^.+/logo/$']
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
# post_analysis_hooks = ['top_visitors.py']
# display_hooks = ['top_visitors.py']
count_hit_only_visitors = False
default_conf.py
2222
2323
2424
25
26
pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200, 304]
count_hit_only_visitors = True
iwla.py
7575
7676
7777
78
79
80
81
7882
7983
8084
......
264268
265269
266270
267
268
271
269272
270273
271274
272275
273276
274277
275
278
279
276280
277281
278282
......
298302
299303
300304
301
305
306
307
308
309
310
311
312
302313
303314
304315
......
331342
332343
333344
334
335
336
345
346
347
348
349
350
337351
338352
339353
......
349363
350364
351365
352
366
353367
354368
355369
......
374388
375389
376390
377
391
378392
379393
380394
def getStartAnalysisTime(self):
return self.meta_infos['start_analysis_time']
def isValidForCurrentAnalysis(self, request):
cur_time = self.meta_infos['start_analysis_time']
return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
def _clearMeta(self):
self.meta_infos = {
'last_time' : None
#stats['requests'] = set()
stats['nb_visitors'] = 0
for k in visits.keys():
super_hit = visits[k]
for (k, super_hit) in visits.items():
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwidth']
continue
#print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
if not super_hit['hit_only']:
if conf.count_hit_only_visitors or\
super_hit['viewed_pages']:
stats['nb_visitors'] += 1
stats['viewed_bandwidth'] += super_hit['bandwidth']
stats['viewed_pages'] += super_hit['viewed_pages']
self.current_analysis['month_stats'] = stats
self.valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
self.valid_visitors = {}
for (k,v) in visits.items():
if v['robot']: continue
if conf.count_hit_only_visitors and\
(not v['viewed_pages']):
continue
self.valid_visitors[k] = v
self._callPlugins(conf.POST_HOOK_DIRECTORY)
path = self.getDBFilename(cur_time)
for k in stats.keys():
stats[k] -= self.current_analysis['days_stats'][last_day][k]
stats['nb_visitors'] = 0
for k in visits.keys():
if visits[k]['robot']: continue
if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
for (k,v) in visits.items():
if v['robot']: continue
if conf.count_hit_only_visitors and\
(not v['viewed_pages']):
continue
if v['last_access'].tm_mday == cur_time.tm_mday:
stats['nb_visitors'] += 1
print stats
self.analyse_started = True
else:
if not self.analyse_started:
if time.mktime(cur_time) >= time.mktime(t):
if not self.isValidForCurrentAnalysis(hit):
return False
else:
self.analyse_started = True
return True
def start(self):
print '==> Analyse previous database'
print '==> Load previous database'
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:
plugins/display/all_visits.py
1111
1212
1313
14
15
1416
1517
1618
......
2325
2426
2527
26
28
2729
2830
2931
def hook(self):
hits = self.iwla.getValidVisitors()
display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False)
last_access = sorted(hits.values(), key=lambda t: t['last_access'], reverse=True)
cur_time = self.iwla.getCurTime()
table = DisplayHTMLBlockTable('Last seen', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen'])
for super_hit in last_access:
address = super_hit['remote_addr']
if self.iwla.getConfValue('display_visitor_ip', False) and\
if display_visitor_ip and\
super_hit.get('dns_name_replaced', False):
address = '%s [%s]' % (address, super_hit['remote_ip'])
plugins/display/referers.py
9191
9292
9393
94
9594
9695
9796
index.appendBlock(table)
# All key phrases in a file
cur_time = self.iwla.getCurTime()
title = time.strftime('Key Phrases - %B %Y', cur_time)
filename = 'key_phrases_%d.html' % (cur_time.tm_mon)
plugins/display/top_visitors.py
1111
1212
1313
14
15
1416
15
17
18
1619
1720
1821
......
2023
2124
2225
23
26
2427
2528
2629
def hook(self):
hits = self.iwla.getValidVisitors()
count_hit_only = self.iwla.getConfValue('count_hit_only_visitors', False)
display_visitor_ip = self.iwla.getConfValue('display_visitor_ip', False)
top_bandwidth = [(k,hits[k]['bandwidth']) for k in hits.keys()]
top_bandwidth = [(k,v['bandwidth']) for (k,v) in hits.items() \
if count_hit_only or v['viewed_pages']]
top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)
top_visitors = [hits[h[0]] for h in top_bandwidth[:10]]
table = DisplayHTMLBlockTable('Top visitors', ['Host', 'Pages', 'Hits', 'Bandwidth', 'Last seen'])
for super_hit in top_visitors:
address = super_hit['remote_addr']
if self.iwla.getConfValue('display_visitor_ip', False) and\
if display_visitor_ip and\
super_hit.get('dns_name_replaced', False):
address = '%s [%s]' % (address, super_hit['remote_ip'])
plugins/post_analysis/referers.py
1
21
32
43
......
6665
6766
6867
69
70
7168
7269
7370
......
7875
7976
8077
81
78
8279
8380
8481
import time
import re
import xml.sax.saxutils as saxutils
break
def hook(self):
start_time = self.iwla.getStartAnalysisTime()
start_time = time.mktime(start_time)
stats = self.iwla.getCurrentVisists()
month_stats = self.iwla.getMonthStats()
for (k, super_hit) in stats.items():
for r in super_hit['requests']:
if time.mktime(r['time_decoded']) < start_time: continue
if not self.iwla.isValidForCurrentAnalysis(r): continue
if not r['http_referer']: continue
uri = r['extract_referer']['extract_uri']
plugins/post_analysis/top_pages.py
1
21
32
43
......
1413
1514
1615
17
18
19
2016
2117
2218
......
2723
2824
2925
30
26
3127
3228
3329
import time
import re
from iwla import IWLA
return True
def hook(self):
start_time = self.iwla.getStartAnalysisTime()
start_time = time.mktime(start_time)
stats = self.iwla.getCurrentVisists()
month_stats = self.iwla.getMonthStats()
for r in super_hit['requests']:
if not r['is_page']: continue
if time.mktime(r['time_decoded']) < start_time: continue
if not self.iwla.isValidForCurrentAnalysis(r): continue
uri = r['extract_request']['extract_uri']
if self.index_re.match(uri):
plugins/pre_analysis/page_to_hit.py
11
2
32
43
54
......
2120
2221
2322
24
25
26
2723
2824
2925
3026
3127
32
33
34
35
36
37
38
28
29
30
31
32
33
34
3935
4036
4137
import re
import time
from iwla import IWLA
from iplugin import IPlugin
return True
def hook(self):
start_time = self.iwla.getStartAnalysisTime()
start_time = time.mktime(start_time)
hits = self.iwla.getCurrentVisists()
viewed_http_codes = self.iwla.getConfValue('viewed_http_codes', [200, 304])
for (k, super_hit) in hits.items():
if super_hit['robot']: continue
for p in super_hit['requests']:
if not p['is_page']: continue
if time.mktime(p['time_decoded']) < start_time: continue
uri = p['extract_request']['extract_uri']
for r in self.regexps:
if r.match(uri):
p['is_page'] = False
for request in super_hit['requests']:
if not request['is_page']: continue
if not self.iwla.isValidForCurrentAnalysis(request): continue
uri = request['extract_request']['extract_uri']
for regexp in self.regexps:
if regexp.match(uri):
request['is_page'] = False
super_hit['viewed_pages'] -= 1
super_hit['viewed_hits'] += 1
break
plugins/pre_analysis/robots.py
1818
1919
2020
21
22
23
21
2422
2523
2624
2725
2826
2927
30
31
32
33
34
28
29
30
31
32
33
3534
36
37
38
35
36
37
3938
4039
4140
# Basic rule to detect robots
def hook(self):
hits = self.iwla.getCurrentVisists()
for k in hits.keys():
super_hit = hits[k]
for (k, super_hit) in hits.items():
if super_hit['robot']: continue
isRobot = False
referers = 0
first_page = super_hit['requests'][0]
if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday:
for r in self.awstats_robots:
if r.match(first_page['http_user_agent']):
isRobot = True
break
if not self.iwla.isValidForCurrentAnalysis(first_page): continue
for r in self.awstats_robots:
if r.match(first_page['http_user_agent']):
isRobot = True
break
if isRobot:
super_hit['robot'] = 1
continue
if isRobot:
super_hit['robot'] = 1
continue
# 1) no pages view --> robot
# if not super_hit['viewed_pages']:

Archive Download the corresponding diff file

Branches

Tags