iwla

iwla Commit Details

Date: 2014-11-19 19:34:16 (6 years 8 months ago)
Author: Grégory Soutadé
Branch: dev, master
Commit: 888b481b1dd2ee05d6bd087e3bb7122d26e33f77
Parents: db965a8070204d272a002ea22430b5fa610b0b42
Message: On r715

Changes:
A hooks/pre_analysis/H001_robot.py (full)
A hooks/pre_analysis/H002_soutade.py (full)
D hooks_pre/H001_robot.py (full)
D hooks_pre/H002_soutade.py (full)
M iwla.py (5 diffs)

File differences

hooks/pre_analysis/H001_robot.py
# Basic rule to detect robots
def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        isRobot = False
        referers = 0

        # 1) no pages view --> robot
        if not super_hit['viewed_pages']:
            super_hit['robot'] = 1
            continue

        # 2) pages without hit --> robot
        if not super_hit['viewed_hits']:
            super_hit['robot'] = 1
            continue

        for hit in super_hit['pages']:
            # 3) /robots.txt read
            if hit['extract_request']['http_uri'] == '/robots.txt':
                isRobot = True
                break

            # 4) Any referer for hits
            if not hit['is_page'] and hit['http_referer']:
                referers += 1

        if isRobot:
            super_hit['robot'] = 1
            continue

        if super_hit['viewed_hits'] and not referers:
            super_hit['robot'] = 1
            continue
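For context, the hits structure this hook walks is keyed by remote address, and each entry carries the counters and page list the rules test. A minimal, hand-built example (the address and counter values are invented; the field names come from the code above) that trips rule 1:

# Hypothetical visitor: raw hits only, no pages viewed, so rule 1 flags it
hits = {
    '203.0.113.7': {
        'robot': 0,
        'viewed_pages': 0,
        'viewed_hits': 3,
        'pages': [],
    }
}

hook(hits)
print hits['203.0.113.7']['robot']   # => 1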
hooks/pre_analysis/H002_soutade.py
import re

# Remove logo from indefero
logo_re = re.compile(r'^.+/logo/$')

# Basic rule to detect robots
def hook(hits):
    for k in hits.keys():
        super_hit = hits[k]

        if super_hit['robot']: continue

        for p in super_hit['pages']:
            if not p['is_page']: continue
            if logo_re.match(p['extract_request']['extract_uri']):
                p['is_page'] = False
                super_hit['viewed_pages'] -= 1
                super_hit['viewed_hits'] += 1
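As a quick check of the reclassification rule: logo_re only matches URIs ending in /logo/, which this hook demotes from page to hit. The sample URIs below are invented, in the style of indefero project pages:

print bool(logo_re.match('/p/iwla/logo/'))    # True: counted as a hit, not a page
print bool(logo_re.match('/p/iwla/source/'))  # False: left untouched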
hooks_pre/H001_robot.py
../plugins/hooks_pre/H001_robot.py
hooks_pre/H002_soutade.py
../plugins/hooks_pre/H002_soutade.py
iwla.py
 import time
 import glob
 import imp
+import pickle
+import gzip

 from robots import awstats_robots;

 print '==> Start'

-meta_visit = {}
+meta_visit = {'last_time':None}
+analyse_started = False
 current_visit = {}
......
 log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
 pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
 viewed_http_codes = [200]

-cur_time = None
-
-PRE_HOOK_DIRECTORY = './hooks_pre/*.py'
-POST_HOOK_DIRECTORY = './hooks_post/*.py'
+PRE_HOOK_DIRECTORY = './hooks/pre_analysis/*.py'
+POST_HOOK_DIRECTORY = './hooks/post_analysis/*.py'
+DB_ROOT = './output/'
+META_PATH = DB_ROOT + 'meta.db'
+DB_FILENAME = 'iwla.db'

 print '==> Generating robot dictionary'

 awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)

-def generate_day_stats():
-    days_stats = {}
-    days_stats['viewed_bandwidth'] = 0
-    days_stats['not_viewed_bandwidth'] = 0
-    days_stats['viewed_pages'] = 0
-    days_stats['viewed_hits'] = 0
-    days_stats['pages'] = set()
+def get_db_filename(time):
+    return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME)

-    for k in current_visit.keys():
-        super_hit = current_visit[k]
-        if super_hit['robot']:
-            days_stats['not_viewed_bandwidth'] += super_hit['bandwith']
-            continue
+def serialize(obj, filename):
+    base = os.path.dirname(filename)
+    if not os.path.exists(base):
+        os.makedirs(base)

-        days_stats['viewed_bandwidth'] += super_hit['bandwith']
-        days_stats['viewed_pages'] += super_hit['viewed_pages']
-        days_stats['viewed_hits'] += super_hit['viewed_hits']
+    with open(filename + '.tmp', 'wb+') as f:
+        pickle.dump(obj, f)
+        f.seek(0)
+        with gzip.open(filename, 'w') as fzip:
+            fzip.write(f.read())
+    os.remove(filename + '.tmp')

-        for p in super_hit['pages']:
-            if not p['is_page']: continue
-            req = p['extract_request']
-            days_stats['pages'].add(req['extract_uri'])
+def deserialize(filename):
+    if not os.path.exists(filename):
+        return None

-    return days_stats
+    with gzip.open(filename, 'r') as f:
+        return pickle.load(f)
+    return None

 def call_plugins(path, *kwargs):
     print '==> Call plugins (%s)' % path
......
     hit['time_decoded'] = time.strptime(t, time_format)

-def newHit(hit):
-    global cur_time
+def generate_month_stats():
+    call_plugins(PRE_HOOK_DIRECTORY, current_visit)

-    if not decode_http_request(hit): return
+    valid_visitors = {k: v for (k,v) in current_visit.items() if not current_visit[k]['robot']}

-    for k in hit.keys():
-        if hit[k] == '-': hit[k] = ''
+    call_plugins(POST_HOOK_DIRECTORY, valid_visitors)
+
+    stats = {}
+    stats['viewed_bandwidth'] = 0
+    stats['not_viewed_bandwidth'] = 0
+    stats['viewed_pages'] = 0
+    stats['viewed_hits'] = 0
+    stats['pages'] = set()
+
+    for k in current_visit.keys():
+        super_hit = current_visit[k]
+        if super_hit['robot']:
+            stats['not_viewed_bandwidth'] += super_hit['bandwith']
+            continue
+
+        stats['viewed_bandwidth'] += super_hit['bandwith']
+        stats['viewed_pages'] += super_hit['viewed_pages']
+        stats['viewed_hits'] += super_hit['viewed_hits']
+
+        for p in super_hit['pages']:
+            if not p['is_page']: continue
+            req = p['extract_request']
+            stats['pages'].add(req['extract_uri'])
+
+    cur_time = meta_visit['last_time']
+    print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
+    print stats
+
+    path = get_db_filename(cur_time)
+    if os.path.exists(path):
+        os.remove(path)
+
+    print "==> Serialize to %s" % path
+    serialize(current_visit, path)
+
+def newHit(hit):
+    global current_visit
+    global analyse_started

     decode_time(hit)

     t = hit['time_decoded']

-    meta_visit['last_time'] = t
+    cur_time = meta_visit['last_time']

     if cur_time == None:
-        cur_time = t
+        current_visit = deserialize(get_db_filename(t))
+        if not current_visit: current_visit = {}
+        analyse_started = True
     else:
-        if cur_time.tm_mday != t.tm_mday:
-            return False
+        if not analyse_started:
+            if time.mktime(cur_time) >= time.mktime(t):
+                return
+            else:
+                analyse_started = True
+        if cur_time.tm_mon != t.tm_mon:
+            generate_month_stats()
+            current_visit = deserialize(get_db_filename(t))
+            if not current_visit: current_visit = {}
+
+    meta_visit['last_time'] = t
+
+    if not decode_http_request(hit): return False
+
+    for k in hit.keys():
+        if hit[k] == '-': hit[k] = ''

     remote_addr = hit['remote_addr']
     if remote_addr in current_visit.keys():
......
     return True

 print '==> Analysing log'
+
+meta_visit = deserialize(META_PATH)
+if not meta_visit:
+    meta_visit = {'last_time':None}

 f = open("access.log")
 for l in f:
     # print "line " + l;
......
         print "No match " + l

 f.close();

-call_plugins(PRE_HOOK_DIRECTORY, current_visit)
-
-stats = generate_day_stats()
-print stats
-
-valid_visitors = {k: v for (k,v) in current_visit.items() if not current_visit[k]['robot']}
-#print valid_visitors
-# for ip in current_visit.keys():
-#     hit = current_visit[ip]
-#     if hit['robot']: continue
-#     print "%s =>" % (ip)
-#     for k in hit.keys():
-#         if k != 'pages':
-#             print "\t%s : %s" % (k, current_visit[ip][k])
-call_plugins(POST_HOOK_DIRECTORY, valid_visitors)
+generate_month_stats()
+serialize(meta_visit, META_PATH)
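The new serialize()/deserialize() pair stores each month's visits as a gzip-compressed pickle under DB_ROOT/<year>/<month>_iwla.db, the path built by get_db_filename(). A standalone round-trip sketch of that scheme (not part of the commit; the sample payload is invented):

import os
import time
import pickle
import gzip

DB_ROOT = './output/'
DB_FILENAME = 'iwla.db'

def get_db_filename(t):
    # Same layout as the commit: ./output/<year>/<month>_iwla.db
    return (DB_ROOT + '%d/%d_%s') % (t.tm_year, t.tm_mon, DB_FILENAME)

path = get_db_filename(time.localtime())

# serialize() creates the year directory on demand
base = os.path.dirname(path)
if not os.path.exists(base):
    os.makedirs(base)

# gzip-compressed pickle, as serialize()/deserialize() do
with gzip.open(path, 'w') as f:
    pickle.dump({'viewed_pages': 1}, f)

with gzip.open(path, 'r') as f:
    print pickle.load(f)   # => {'viewed_pages': 1}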
