iwla Commit Details

Date: 2014-11-20 16:15:57 (6 years 8 months ago)
Author: Grégory Soutadé
Branches: dev, master
Commit: f3cb04b16cf8856294ca426d1e5aa6bae87a719c
Parents: 4cc29487a298157892ea61f361fbfc05e262d8ff
Message: Externalize plugins

Changes:
D hooks/pre_analysis/H001_soutade.py (full)
D hooks/pre_analysis/H002_robot.py (full)
R robots.py → awstats_robots_data.py
M conf.py (1 diff)
M iwla.py (6 diffs)
M plugins/pre_analysis/H001_robot.py (2 diffs)
M plugins/pre_analysis/H002_soutade.py (2 diffs)

File differences

conf.py
 DB_ROOT = './output/'
 DISPLAY_ROOT = './output/'
+
+pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
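
The hook names listed here are plain file names, resolved against the new plugin directories set up in iwla.py below. A minimal sketch of the interface such a pre-analysis plugin has to expose, inferred from the loader added in this commit (the file name and rule are hypothetical):

# plugins/pre_analysis/H003_example.py (hypothetical)
PLUGIN_CLASS = 'HTTP'
API_VERSION = 1

def get_plugins_infos():
    # Compatibility data checked by preloadPlugins()
    return {'class' : PLUGIN_CLASS,
            'min_version' : API_VERSION,
            'max_version' : -1}

def load():
    # One-time initialization; returning False unloads the plugin
    return True

def hook(hits):
    # Called with the visits dictionary and expected to mutate it in place
    pass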
hooks/pre_analysis/H001_soutade.py
import re

# Remove logo from indefero
logo_re = re.compile(r'^.+/logo/$')

# Basic rule to detect robots
def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        for p in super_hit['pages']:
            if not p['is_page']: continue
            if int(p['status']) != 200: continue
            if logo_re.match(p['extract_request']['extract_uri']):
                p['is_page'] = False
                if super_hit['viewed_pages']:
                    super_hit['viewed_pages'] -= 1
                super_hit['viewed_hits'] += 1
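
For context, hits maps a visitor key to a "super hit" record. A hypothetical example built only from the keys the hook above reads (all values invented):

hits = {
    '192.168.0.1' : {
        'robot' : False,
        'viewed_pages' : 2,
        'viewed_hits' : 3,
        'pages' : [{'is_page' : True,
                    'status' : '200',
                    'extract_request' : {'extract_uri' : '/p/iwla/logo/'}}],
    },
}

hook(hits)  # the logo request is reclassified from page view to plain hit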
hooks/pre_analysis/H002_robot.py
# Basic rule to detect robots
def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        isRobot = False
        referers = 0

        # 1) no pages view --> robot
        # if not super_hit['viewed_pages']:
        #     super_hit['robot'] = 1
        #     continue

        # 2) pages without hit --> robot
        if not super_hit['viewed_hits']:
            super_hit['robot'] = 1
            continue
        elif not super_hit['viewed_pages']:
            # Hit only
            super_hit['hit_only'] = 1

        for hit in super_hit['pages']:
            # 3) /robots.txt read
            if hit['extract_request']['http_uri'] == '/robots.txt':
                isRobot = True
                break

            # 4) Any referer for hits
            if not hit['is_page'] and hit['http_referer']:
                referers += 1

        if isRobot:
            super_hit['robot'] = 1
            continue

        if super_hit['viewed_hits'] and not referers:
            super_hit['robot'] = 1
            continue
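
To make heuristic 2 concrete: a visitor that viewed pages but fetched no auxiliary resources at all is flagged as a robot. A hypothetical record (values invented, keys taken from the code above):

hits = {
    'crawler' : {
        'robot' : False,
        'viewed_pages' : 5,
        'viewed_hits' : 0,
        'pages' : [],
    },
}

hook(hits)
assert hits['crawler']['robot'] == 1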
iwla.py
 import pickle
 import gzip
 
-from robots import awstats_robots;
-
 # Default configuration
 DB_ROOT = './output/'
 time_format = '%d/%b/%Y:%H:%M:%S +0100'
...
 pre_analysis_hooks = []
 post_analysis_hooks = []
 display_hooks = []
+
+from conf import *
 
 print '==> Start'
...
 pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 viewed_http_codes = [200]
 
-HOOKS_ROOT = './hooks/'
-PRE_HOOK_DIRECTORY = HOOKS_ROOT + 'pre_analysis/*.py'
-POST_HOOK_DIRECTORY = HOOKS_ROOT + 'post_analysis/*.py'
-DISPLAY_HOOK_DIRECTORY = HOOKS_ROOT + 'display/*.py'
+HOOKS_ROOT = './plugins/'
+PRE_HOOK_DIRECTORY = HOOKS_ROOT + 'pre_analysis/'
+POST_HOOK_DIRECTORY = HOOKS_ROOT + 'post_analysis/'
+DISPLAY_HOOK_DIRECTORY = HOOKS_ROOT + 'display/'
 
 META_PATH = DB_ROOT + 'meta.db'
 DB_FILENAME = 'iwla.db'
 
-print '==> Generating robot dictionary'
-awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
+plugins = {PRE_HOOK_DIRECTORY : pre_analysis_hooks, POST_HOOK_DIRECTORY : post_analysis_hooks, DISPLAY_HOOK_DIRECTORY : display_hooks}
+
+ANALYSIS_CLASS = 'HTTP'
+API_VERSION = 1
+
+def preloadPlugins():
+    for root in plugins.keys():
+        for plugin_name in plugins[root]:
+            p = root + '/' + plugin_name
+            try:
+                mod = cache_plugins[p] = imp.load_source('hook', p)
+                infos = mod.get_plugins_infos()
+                if infos['class'] != ANALYSIS_CLASS or \
+                   API_VERSION < infos['min_version'] or\
+                   (infos['max_version'] != -1 and (API_VERSION > infos['max_version'])):
+                    del cache_plugins[p]
+                elif not mod.load():
+                    del cache_plugins[p]
+            except Exception as e:
+                print 'Error loading \'%s\' => %s' % (p, e)
+                return False
+    return True
 
 def createEmptyVisits():
     visits = {'days_stats' : {}, 'month_stats' : {}, 'visits' : {}}
...
             return pickle.load(f)
     return None
 
-def callPlugins(path, *kwargs):
-    print '==> Call plugins (%s)' % path
-    plugins = glob.glob(path)
-    plugins.sort()
-    for p in plugins:
+def callPlugins(root, *kwargs):
+    print '==> Call plugins (%s)' % root
+    for p in plugins[root]:
         print '\t%s' % (p)
-        if not p in cache_plugins:
-            mod = imp.load_source('hook', p)
-            cache_plugins[p] = mod
-        else:
-            mod = cache_plugins[p]
+        mod = cache_plugins[root + '/' + p]
         mod.hook(*kwargs)
 
 def isPage(request):
...
     super_hit['bandwith'] = 0;
     super_hit['last_access'] = meta_visit['last_time']
     super_hit['pages'] = [];
-    super_hit['robot'] = isRobot(hit);
+    super_hit['robot'] = False
     super_hit['hit_only'] = 0;
     appendHit(hit)
 
-def isRobot(hit):
-    for r in awstats_robots:
-        if r.match(hit['http_user_agent']):
-            return True
-    return False
-
 def decodeHTTPRequest(hit):
     if not 'request' in hit.keys(): return False
...
     return True
 
+preloadPlugins()
+
 print '==> Analysing log'
 meta_visit = deserialize(META_PATH) or createEmptyMeta()
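
Net effect of these changes: plugins are no longer discovered with glob.glob() at call time but declared in the configuration, version-checked, and imported once up front. A sketch of the resulting call sequence (the hits argument is an assumption, not shown in this diff):

preloadPlugins()                       # import each configured plugin, dropping incompatible ones
callPlugins(PRE_HOOK_DIRECTORY, hits)  # run hook(hits) for every loaded pre-analysis plugin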
plugins/pre_analysis/H001_robot.py
+import re
+
+from awstats_robots_data import awstats_robots
+
+PLUGIN_CLASS = 'HTTP'
+API_VERSION = 1
+
+def get_plugins_infos():
+    infos = {'class' : PLUGIN_CLASS,
+             'min_version' : API_VERSION,
+             'max_version' : -1}
+    return infos
+
+def load():
+    global awstats_robots
+    print '==> Generating robot dictionary'
+    awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
+    return True
+
 # Basic rule to detect robots
...
         isRobot = False
         referers = 0
 
+        for r in awstats_robots:
+            if r.match(super_hit['pages'][0]['http_user_agent']):
+                super_hit['robot'] = 1
+                continue
+
         # 1) no pages view --> robot
         if not super_hit['viewed_pages']:
             super_hit['robot'] = 1
plugins/pre_analysis/H002_soutade.py
 # Remove logo from indefero
 logo_re = re.compile(r'^.+/logo/$')
 
+PLUGIN_CLASS = 'HTTP'
+API_VERSION = 1
+
+def get_plugins_infos():
+    infos = {'class' : PLUGIN_CLASS,
+             'min_version' : API_VERSION,
+             'max_version' : -1}
+    return infos
+
+def load():
+    return True
+
 # Basic rule to detect robots
 def hook(hits):
...
         for p in super_hit['pages']:
             if not p['is_page']: continue
             if int(p['status']) != 200: continue
+
             if logo_re.match(p['extract_request']['extract_uri']):
                 p['is_page'] = False
                 super_hit['viewed_pages'] -= 1
