iwla Commit Details

Date:     2014-11-19 08:01:12
Author:   Grégory Soutadé
Branches: dev, master
Commit:   b1549ca8845a03423a562fdb3b72ac558a356936
Parents:  26688e4bf7c1bda7db98a0ca65fa09072f8b0113
Message:  Initial commit

Changes:
A  hooks_pre/H001_robot.py (full)
A  hooks_pre/H002_soutade.py (full)
A  plugins/hooks_pre/H001_robot.py (full)
A  plugins/hooks_pre/H002_soutade.py (full)
M  iwla.py (8 diffs)

File differences

hooks_pre/H001_robot.py
    ../plugins/hooks_pre/H001_robot.py

hooks_pre/H002_soutade.py
    ../plugins/hooks_pre/H002_soutade.py

(Each hooks_pre entry is a one-line file whose content is the relative path
of the matching plugin source, i.e. a symlink into plugins/hooks_pre/.)
iwla.py
 print '==> Start'

 meta_visit = {}
 current_visit = {}

 log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
...
 #print "Log format : " + log_format_extracted

 log_re = re.compile(log_format_extracted)
-uri_re = re.compile(r'(?P<extract_uri>[^\?]*)\?(?P<extract_parameters>.*)')
+uri_re = re.compile(r'(?P<extract_uri>[^\?]*)[\?(?P<extract_parameters>.*)]?')
 pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 viewed_http_codes = [200]

 cur_time = None

+PRE_HOOK_DIRECTORY = './hooks_pre/*.py'
+POST_HOOK_DIRECTORY = './hooks_post/*.py'

 print '==> Generating robot dictionary'

 awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)

+def generate_day_stats():
+    days_stats = {}
+    days_stats['viewed_bandwidth'] = 0
+    days_stats['not_viewed_bandwidth'] = 0
+    days_stats['viewed_pages'] = 0
+    days_stats['viewed_hits'] = 0
+    days_stats['pages'] = set()
+
+    for k in current_visit.keys():
+        super_hit = current_visit[k]
+        if super_hit['robot']:
+            days_stats['not_viewed_bandwidth'] += super_hit['bandwith']
+            continue
+        days_stats['viewed_bandwidth'] += super_hit['bandwith']
+        days_stats['viewed_pages'] += super_hit['viewed_pages']
+        days_stats['viewed_hits'] += super_hit['viewed_hits']
+        for p in super_hit['pages']:
+            if not p['is_page']: continue
+            req = p['extract_request']
+            days_stats['pages'].add(req['extract_uri'])
+
+    return days_stats
+
+def call_plugins(path, *kwargs):
+    print '==> Call plugins (%s)' % path
+    plugins = glob.glob(path)
+    plugins.sort()
+    for p in plugins:
+        print '\t%s' % (p)
+        mod = imp.load_source('hook', p)
+        mod.hook(*kwargs)
+
 def isPage(request):
     for e in pages_extensions:
         if request.endswith(e):
...
     else:
         super_hit[hit_key] += 1

-def createUser(hit):
+def createGeneric(hit):
     super_hit = current_visit[hit['remote_addr']] = {}
     super_hit['viewed_pages'] = 0;
     super_hit['viewed_hits'] = 0;
     super_hit['not_viewed_hits'] = 0;
     super_hit['bandwith'] = 0;
     super_hit['pages'] = [];
+    return super_hit
+
+def createUser(hit, robot):
+    super_hit = createGeneric(hit)
+    super_hit['robot'] = robot;
-    super_hit['robot'] = isRobot(hit);
     appendHit(hit)

 def isRobot(hit):
...
         hit['extract_request'] = groups.groupdict()

         uri_groups = uri_re.match(hit['extract_request']['http_uri']);
         if uri_groups:
-            hit['extract_request']['extract_uri'] = uri_groups.group('extract_uri')
-            hit['extract_request']['extract_parameters'] = uri_groups.group('extract_parameters')
+            d = uri_groups.groupdict()
+            hit['extract_request']['extract_uri'] = d['extract_uri']
+            if 'extract_parameters' in d.keys():
+                hit['extract_request']['extract_parameters'] = d['extract_parameters']
     else:
         print "Bad request extraction " + hit['request']
         return False

     referer_groups = uri_re.match(hit['http_referer']);
     if referer_groups:
-        hit['extract_referer']['extract_uri'] = referer_groups.group('extract_uri')
-        hit['extract_referer']['extract_parameters'] = referer_groups.group('extract_parameters')
+        referer = hit['extract_referer'] = referer_groups.groupdict()
     return True

 def decode_time(hit):
...
     t = hit['time_decoded']

     current_visit['last_time'] = t
     meta_visit['last_time'] = t

     if cur_time == None:
         cur_time = t
...
     if remote_addr in current_visit.keys():
         appendHit(hit)
     else:
-        createUser(hit)
+        createUser(hit, isRobot(hit))

     return True
...
         print "No match " + l

 f.close();

-print '==> Call plugins'
-plugins = glob.glob('./hooks_pre/*.py')
-plugins.sort()
-for p in plugins:
-    print '\t%s' % (p)
-    mod = imp.load_source('hook', p)
-    mod.hook(current_visit)
-
-for ip in current_visit.keys():
-    hit = current_visit[ip]
-    if hit['robot']: continue
-    print "%s =>" % (ip)
-    for k in hit.keys():
-        if k != 'pages':
-            print "\t%s : %s" % (k, current_visit[ip][k])
+call_plugins(PRE_HOOK_DIRECTORY, current_visit)
+
+stats = generate_day_stats()
+print stats
+
+valid_visitors = {k: v for (k,v) in current_visit.items() if not current_visit[k]['robot']}
+#print valid_visitors
+# for ip in current_visit.keys():
+#     hit = current_visit[ip]
+#     if hit['robot']: continue
+#     print "%s =>" % (ip)
+#     for k in hit.keys():
+#         if k != 'pages':
+#             print "\t%s : %s" % (k, current_visit[ip][k])
+
+call_plugins(POST_HOOK_DIRECTORY, valid_visitors)
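The uri_re change appears meant to make the query string optional: the new code reads the named groups through groupdict() and only stores extract_parameters when present. A minimal sketch of that intent, written with a conventional optional non-capturing group — this pattern is the editor's illustration, not the one committed above:

import re

# Sketch (assumed pattern, not the committed one): the '?' and the
# parameters live in an optional non-capturing group, so URIs without
# a query string still match and extract_parameters is simply None.
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)(?:\?(?P<extract_parameters>.*))?')

for uri in ['/index.html', '/search.php?q=iwla&page=2']:
    d = uri_re.match(uri).groupdict()
    print '%s -> uri=%r, parameters=%r' % (uri, d['extract_uri'], d['extract_parameters'])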
plugins/hooks_pre/H001_robot.py
# Basic rule to detect robots

def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        isRobot = False
        referers = 0

        # 1) no pages view --> robot
        if not super_hit['viewed_pages']:
            super_hit['robot'] = 1
            continue

        # 2) pages without hit --> robot
        if not super_hit['viewed_hits']:
            super_hit['robot'] = 1
            continue

        for hit in super_hit['pages']:
            # 3) /robots.txt read
            if hit['extract_request']['http_uri'] == '/robots.txt':
                isRobot = True
                break

            # 4) Any referer for hits
            if not hit['is_page'] and hit['http_referer']:
                referers += 1

        if isRobot:
            super_hit['robot'] = 1
            continue

        if super_hit['viewed_hits'] and not referers:
            super_hit['robot'] = 1
            continue
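Each hook receives the current_visit dictionary keyed by remote address; the per-visitor entries carry the counters initialised in createGeneric() plus the pages list filled by appendHit(). A small, self-contained way to exercise the rule above in isolation, mirroring how iwla itself loads hooks (the sample values are invented, and the path assumes the hook file is on disk):

import imp

h001 = imp.load_source('hook', './hooks_pre/H001_robot.py')

# Invented visitor entry; keys mirror createGeneric() ('bandwith'
# spelled as in the source). No page was viewed, so rule 1 applies.
hits = {
    '203.0.113.7': {
        'robot': 0,
        'viewed_pages': 0,
        'viewed_hits': 3,
        'not_viewed_hits': 0,
        'bandwith': 12345,
        'pages': [],
    },
}

h001.hook(hits)
print hits['203.0.113.7']['robot']   # -> 1 (flagged as robot)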
plugins/hooks_pre/H002_soutade.py
import re

# Remove logo from indefero
logo_re = re.compile(r'^.+/logo/$')

# Basic rule to detect robots
def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        for p in super_hit['pages']:
            if not p['is_page']: continue
            if logo_re.match(p['extract_request']['extract_uri']):
                p['is_page'] = False
                super_hit['viewed_pages'] -= 1
                super_hit['viewed_hits'] += 1
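Since call_plugins() globs the hook directory and sorts the file list, adding a rule is just a matter of dropping a numbered file into hooks_pre/; the numeric prefix fixes the execution order (H001 runs before H002). A minimal sketch of a custom pre-hook following the same contract — the file name and the rule itself are hypothetical:

# hooks_pre/H003_internal.py (hypothetical example)
# Flag visits coming from the local network as robots so they are
# excluded from the viewed-* statistics, like the rules above.

def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]
        if super_hit['robot']: continue
        if k.startswith('192.168.'):
            super_hit['robot'] = 1

Like the shipped hooks, it mutates the visitor entries in place; iwla passes current_visit itself, so nothing needs to be returned.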
