iwla

iwla Commit Details

Date: 2014-11-21 10:41:29 (6 years 8 months ago)
Author: Grégory Soutadé
Branch: dev, master
Commit: 7dada493abd0068bac180572621b43ddcbfd2f4b
Parents: 34aec57c46a8791b8bdee8c192afad71430413df
Message: Plugins OK

Changes:
M conf.py (1 diff)
M display.py (2 diffs)
M iwla.py (17 diffs)
M plugins/pre_analysis/H001_robot.py (2 diffs)
M plugins/pre_analysis/H002_soutade.py (2 diffs)

File differences

conf.py
1111
1212
1313
14
14
15
16
17
18
19
20
DB_ROOT = './output/'
DISPLAY_ROOT = './output/'
pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
pre_analysis_hooks = ['H002_soutade', 'H001_robot']
post_analysis_hooks = ['top_visitors']
display_hooks = ['top_visitors']
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
# post_analysis_hooks = ['top_visitors.py']
# display_hooks = ['top_visitors.py']
display.py
1
12
23
34
......
1415
1516
1617
18
19
1720
1821
1922
20
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
2138
2239
23
40
41
2442
2543
44
45
46
2647
2748
2849
29
30
31
32
33
34
35
36
37
38
39
50
4051
def createPage(display, filename, title):
page = {}
page['title'] = title;
table['cols'] = cols
table['rows'] = []
return table
def appendRowToTable(table, row):
    """Append one row (a sequence of cell values) to *table*'s 'rows' list, in place."""
    rows = table['rows']
    rows.append(row)
def buildPages(display):
def buildTable(block, f):
    """Render *block* as an HTML table into the writable stream *f*.

    *block* is a dict with 'title' (logged only), 'cols' (header labels)
    and 'rows' (list of cell-value sequences).  Values are interpolated
    with %s, so callers are responsible for any HTML escaping.
    """
    # Parenthesized print works identically on Python 2 and Python 3;
    # the bare `print x` statement is a syntax error under Python 3.
    print('Write table %s' % block['title'])
    f.write('<table>')
    # Header row.
    f.write('<tr>')
    for title in block['cols']:
        f.write('<th>%s</th>' % (title))
    f.write('</tr>')
    # Data rows.
    for row in block['rows']:
        f.write('<tr>')
        for v in row:
            f.write('<td>%s</td>' % (v))
        f.write('</tr>')
    f.write('</table>')
def buildPages(display_root, display):
for filename in display.keys():
page = display[filename]
with open(DISPLAY_ROOT + filename, 'w') as f:
print "OPEN %s" % (display_root + filename)
with open(display_root + filename, 'w') as f:
f.write('<html><title>%s</title><body>' % (page['title']))
for block in page['blocks']:
print "Bluid block"
print block
print "End block"
if block['type'] == 'html':
f.write(block['value'])
elif block['type'] == 'table':
f.write('<table>')
f.write('<tr>')
for title in block['cols']:
f.write('<th>%s</th>' % (title))
f.write('</tr>')
for row in block['rows']:
f.write('<tr>')
for v in row:
f.write('<td>%s</td>' % (v))
f.write('</tr>')
f.write('</table>')
buildTable(block, f)
f.write('</body></html>')
iwla.py
1717
1818
1919
20
20
2121
2222
2323
......
3535
3636
3737
38
38
3939
4040
4141
......
5757
5858
5959
60
6061
6162
6263
6364
64
65
66
67
68
69
70
71
6572
6673
6774
......
7178
7279
7380
74
75
81
82
7683
7784
7885
......
113120
114121
115122
116
123
117124
118125
119126
120127
121
128
122129
123130
124131
......
135142
136143
137144
138
139
145
146
140147
141148
142149
......
167174
168175
169176
170
171
172
173
174
177
178
179
180
181
182
175183
176
184
177185
178
186
179187
180188
181189
......
185193
186194
187195
188
196
189197
190198
191199
......
195203
196204
197205
198
206
199207
200208
201209
......
205213
206214
207215
216
217
218
219
220
221
208222
209223
210224
211225
212226
213227
214
228
215229
216230
217231
......
243257
244258
245259
246
260
247261
248262
249263
......
251265
252266
253267
254
268
255269
256270
257271
258272
259273
260
274
261275
262276
263
277
264278
265279
266280
267
281
268282
269283
270284
271
285
272286
273287
274
288
275289
276290
277291
......
287301
288302
289303
290
304
291305
292306
293307
......
348362
349363
350364
351
352365
353366
354367
......
371384
372385
373386
374
375
387
388
389
390
376391
377392
378393
379
394
380395
381396
382397
......
385400
386401
387402
388
403
389404
390405
391406
......
393408
394409
395410
411
log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
'"$request" $status $body_bytes_sent ' +\
'"$http_referer" "$http_user_agent"';
'"$http_referer" "$http_user_agent"'
time_format = '%d/%b/%Y:%H:%M:%S +0100'
cache_plugins = {}
display = {}
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format);
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', log_format_extracted)
http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
API_VERSION = 1
def preloadPlugins():
ret = True
for root in plugins.keys():
for plugin_name in plugins[root]:
p = root + '/' + plugin_name
try:
mod = cache_plugins[p] = imp.load_source('hook', p)
fp, pathname, description = imp.find_module(plugin_name, [root])
cache_plugins[p] = imp.load_module(plugin_name, fp, pathname, description)
#cache_plugins[p] = imp.load_module(p,None,p,("py","r",imp.PKG_DIRECTORY))
#cache_plugins[p] = imp.load_source(p, p)
mod = cache_plugins[p]
#print dir(mod)
#print "Register %s -> %s" % (p, mod)
infos = mod.get_plugins_infos()
if infos['class'] != ANALYSIS_CLASS or \
API_VERSION < infos['min_version'] or\
del cache_plugins[p]
except Exception as e:
print 'Error loading \'%s\' => %s' % (p, e)
return False
return True
ret = False
return ret
def createEmptyVisits():
return pickle.load(f)
return None
def callPlugins(root, *kwargs):
def callPlugins(root, *args):
print '==> Call plugins (%s)' % root
for p in plugins[root]:
print '\t%s' % (p)
mod = cache_plugins[root + '/' + p]
mod.hook(*kwargs)
mod.hook(*args)
def isPage(request):
for e in pages_extensions:
return
super_hit = current_visits['visits'][remote_addr]
super_hit['pages'].append(hit)
super_hit['bandwith'] += int(hit['body_bytes_sent'])
super_hit['requests'].append(hit)
super_hit['bandwidth'] += int(hit['body_bytes_sent'])
super_hit['last_access'] = meta_visit['last_time']
request = hit['extract_request']
def createUser(hit):
super_hit = current_visits['visits'][hit['remote_addr']] = {}
super_hit['viewed_pages'] = 0;
super_hit['viewed_hits'] = 0;
super_hit['not_viewed_pages'] = 0;
super_hit['not_viewed_hits'] = 0;
super_hit['bandwith'] = 0;
super_hit['remote_addr'] = hit['remote_addr']
super_hit['viewed_pages'] = 0
super_hit['viewed_hits'] = 0
super_hit['not_viewed_pages'] = 0
super_hit['not_viewed_hits'] = 0
super_hit['bandwidth'] = 0
super_hit['last_access'] = meta_visit['last_time']
super_hit['pages'] = [];
super_hit['requests'] = []
super_hit['robot'] = False
super_hit['hit_only'] = 0;
super_hit['hit_only'] = 0
appendHit(hit)
def decodeHTTPRequest(hit):
if groups:
hit['extract_request'] = groups.groupdict()
uri_groups = uri_re.match(hit['extract_request']['http_uri']);
uri_groups = uri_re.match(hit['extract_request']['http_uri'])
if uri_groups:
d = uri_groups.groupdict()
hit['extract_request']['extract_uri'] = d['extract_uri']
print "Bad request extraction " + hit['request']
return False
referer_groups = uri_re.match(hit['http_referer']);
referer_groups = uri_re.match(hit['http_referer'])
if referer_groups:
referer = hit['extract_referer'] = referer_groups.groupdict()
return True
hit['time_decoded'] = time.strptime(t, time_format)
def getDisplayIndex():
    """Return the display page for the current month's index, or None if absent.

    The lookup key is derived from the module-level ``meta_visit`` last
    access time; pages live in the module-level ``display`` dict.
    """
    t = meta_visit['last_time']
    key = '%d/index_%d.html' % (t.tm_year, t.tm_mon)
    return display.get(key, None)
def generateDisplayDaysStat():
cur_time = meta_visit['last_time']
title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
page = createPage(display, filename, title)
days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwith', 'Robot Bandwith'])
days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth'])
keys = current_visits['days_stats'].keys()
keys.sort()
def generateDisplay():
generateDisplayDaysStat()
callPlugins(DISPLAY_HOOK_DIRECTORY, current_visits, display)
buildPages()
buildPages(DISPLAY_ROOT, display)
def generateStats(visits):
stats = {}
stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0
stats['viewed_hits'] = 0
#stats['pages'] = set()
#stats['requests'] = set()
stats['nb_visitors'] = 0
for k in visits.keys():
super_hit = visits[k]
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwith']
stats['not_viewed_bandwidth'] += super_hit['bandwidth']
continue
print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
#print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
if not super_hit['hit_only']:
stats['nb_visitors'] += 1
stats['viewed_bandwidth'] += super_hit['bandwith']
stats['viewed_bandwidth'] += super_hit['bandwidth']
stats['viewed_pages'] += super_hit['viewed_pages']
stats['viewed_hits'] += super_hit['viewed_hits']
# for p in super_hit['pages']:
# for p in super_hit['requests']:
# if not p['is_page']: continue
# req = p['extract_request']
# stats['pages'].add(req['extract_uri'])
# stats['requests'].add(req['extract_uri'])
return stats
print stats
valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
callPlugins(POST_HOOK_DIRECTORY, valid_visitors)
callPlugins(POST_HOOK_DIRECTORY, valid_visitors, stats)
current_visits['month_stats'] = stats
return
else:
analyse_started = True
current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
if cur_time.tm_mon != t.tm_mon:
generateMonthStats()
current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
print '==> Analysing log'
meta_visit = deserialize(META_PATH) or createEmptyMeta()
current_visits = createEmptyVisits()
if meta_visit['last_time']:
current_visits = deserialize(getDBFilename(meta_visit['last_time'])) or createEmptyVisits()
else:
current_visits = createEmptyVisits()
f = open(analyzed_filename)
for l in f:
# print "line " + l;
# print "line " + l
groups = log_re.match(l)
break
else:
print "No match " + l
f.close();
f.close()
if analyse_started:
generateDayStats()
serialize(meta_visit, META_PATH)
else:
print '==> Analyse not started : nothing to do'
generateMonthStats()
plugins/pre_analysis/H001_robot.py
3030
3131
3232
33
34
35
36
33
34
35
36
37
38
3739
3840
3941
......
4547
4648
4749
48
50
4951
5052
5153
isRobot = False
referers = 0
for r in awstats_robots:
if r.match(super_hit['pages'][0]['http_user_agent']):
super_hit['robot'] = 1
continue
first_page = super_hit['requests'][0]
if first_page['time_decoded'].tm_mday == super_hit['last_access'].tm_mday:
for r in awstats_robots:
if r.match(first_page['http_user_agent']):
super_hit['robot'] = 1
continue
# 1) no pages view --> robot
if not super_hit['viewed_pages']:
super_hit['robot'] = 1
continue
for hit in super_hit['pages']:
for hit in super_hit['requests']:
# 3) /robots.txt read
if hit['extract_request']['http_uri'] == '/robots.txt':
isRobot = True
plugins/pre_analysis/H002_soutade.py
77
88
99
10
11
12
10
11
12
13
14
1315
1416
1517
......
2325
2426
2527
26
28
2729
2830
31
2932
3033
3134
API_VERSION = 1
def get_plugins_infos():
infos = {'class' : PLUGIN_CLASS,
'min_version' : API_VERSION,
'max_version' : -1}
infos = {
'class' : PLUGIN_CLASS,
'min_version' : API_VERSION,
'max_version' : -1
}
return infos
def load():
if super_hit['robot']: continue
for p in super_hit['pages']:
for p in super_hit['requests']:
if not p['is_page']: continue
if int(p['status']) != 200: continue
if p['time_decoded'].tm_mday != super_hit['last_access'].tm_mday: continue
if logo_re.match(p['extract_request']['extract_uri']):
p['is_page'] = False
super_hit['viewed_pages'] -= 1

Archive Download the corresponding diff file

Branches

Tags