iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python

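"""iwla : a small web log analyzer.

What the code below does : parse an nginx-style 'access.log', group hits
into per-visitor visits, compute daily and monthly statistics, let
pre_analysis / post_analysis / display hooks extend each step, and write
HTML reports and gzipped pickle databases under ./output/.
"""
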
import os
import re
import time
import glob
import imp
import pickle
import gzip

from robots import awstats_robots

print '==> Start'

# Global state shared between the analysis and display passes
meta_visit = {}          # persistent metadata (currently only the last hit time)
analyse_started = False  # set once a hit newer than the stored metadata is seen
current_visits = {}      # per-month visit database
cache_plugins = {}       # hook modules already loaded, keyed by file path
display = {}             # HTML pages to generate, keyed by file name

log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
    '"$request" $status $body_bytes_sent ' +\
    '"$http_referer" "$http_user_agent"'

# Turn the nginx-style format string into a regexp with one named group
# per $variable
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', log_format_extracted)
http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
#09/Nov/2014:06:35:16 +0100
time_format = '%d/%b/%Y:%H:%M:%S +0100'
#print "Log format : " + log_format_extracted

log_re = re.compile(log_format_extracted)
# Split a URI into its path and an optional query string
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)(\?(?P<extract_parameters>.*))?')
pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200]

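# A hypothetical example line in the format above (all values invented for
# illustration) :
#   www.example.com:80 192.0.2.1 - - [09/Nov/2014:06:35:16 +0100] \
#     "GET /index.html HTTP/1.1" 200 512 "-" "Mozilla/5.0"
# log_re captures each $variable into a named group of the same name.
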
HOOKS_ROOT = './hooks/'
PRE_HOOK_DIRECTORY = HOOKS_ROOT + 'pre_analysis/*.py'
POST_HOOK_DIRECTORY = HOOKS_ROOT + 'post_analysis/*.py'
DISPLAY_HOOK_DIRECTORY = HOOKS_ROOT + 'display/*.py'
DB_ROOT = './output/'
DISPLAY_ROOT = './output/'
META_PATH = DB_ROOT + 'meta.db'
DB_FILENAME = 'iwla.db'

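# Hook protocol (see callPlugins() below) : every *.py file in one of these
# directories is loaded with imp.load_source() and must define a hook()
# function. pre_analysis hooks receive the raw visits, post_analysis hooks
# the valid (non-robot) visitors, display hooks the current visits and the
# display dictionary.
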
print '==> Generating robot dictionary'

awstats_robots = map(lambda x: re.compile(x, re.IGNORECASE), awstats_robots)

def createEmptyVisits():
    visits = {'days_stats' : {}, 'month_stats' : {}, 'visits' : {}}
    return visits

def createEmptyMeta():
    meta = {'last_time' : None}
    return meta

def createEmptyDisplay():
    display = {}
    return display

def createPage(filename, title):
    page = {}
    page['title'] = title
    page['blocks'] = []
    display[filename] = page

    return page

def getDBFilename(t):
    return (DB_ROOT + '%d/%d_%s') % (t.tm_year, t.tm_mon, DB_FILENAME)

def serialize(obj, filename):
    base = os.path.dirname(filename)
    if not os.path.exists(base):
        os.makedirs(base)

    # Pickle to a temporary file, then store it gzip-compressed
    with open(filename + '.tmp', 'wb+') as f:
        pickle.dump(obj, f)
        f.seek(0)
        with gzip.open(filename, 'w') as fzip:
            fzip.write(f.read())
    os.remove(filename + '.tmp')

def deserialize(filename):
    if not os.path.exists(filename):
        return None

    with gzip.open(filename, 'r') as f:
        return pickle.load(f)

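# On-disk layout (from the constants above) : each month is serialized to
# ./output/<year>/<month>_iwla.db and the global metadata to ./output/meta.db,
# both as gzip-compressed pickles.
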
def callPlugins(path, *args):
    print '==> Call plugins (%s)' % path
    plugins = glob.glob(path)
    plugins.sort()
    for p in plugins:
        print '\t%s' % (p)
        if not p in cache_plugins:
            mod = imp.load_source('hook', p)
            cache_plugins[p] = mod
        else:
            mod = cache_plugins[p]
        mod.hook(*args)

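# A minimal pre_analysis hook, as callPlugins() expects it (illustrative
# sketch only ; the file name and the filtering rule are invented) :
#
#   # ./hooks/pre_analysis/drop_robot_pages.py
#   def hook(visits):
#       # Free the per-hit details kept for robots
#       for super_hit in visits.values():
#           if super_hit['robot']:
#               super_hit['pages'] = []
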
def isPage(request):
    for e in pages_extensions:
        if request.endswith(e):
            return True

    return False

def appendHit(hit):
    remote_addr = hit['remote_addr']

    if not remote_addr in current_visits['visits']:
        createUser(hit)
        return

    super_hit = current_visits['visits'][remote_addr]
    super_hit['pages'].append(hit)
    super_hit['bandwith'] += int(hit['body_bytes_sent'])
    super_hit['last_access'] = meta_visit['last_time']

    request = hit['extract_request']

    if 'extract_uri' in request:
        uri = request['extract_uri']
    else:
        uri = request['http_uri']

    hit['is_page'] = isPage(uri)

    # Don't count 3xx status
    status = int(hit['status'])
    if status >= 300 and status < 400: return

    if super_hit['robot'] or\
       not status in viewed_http_codes:
        page_key = 'not_viewed_pages'
        hit_key = 'not_viewed_hits'
    else:
        page_key = 'viewed_pages'
        hit_key = 'viewed_hits'

    if hit['is_page']:
        super_hit[page_key] += 1
    else:
        super_hit[hit_key] += 1

def createUser(hit):
    super_hit = current_visits['visits'][hit['remote_addr']] = {}
    super_hit['viewed_pages'] = 0
    super_hit['viewed_hits'] = 0
    super_hit['not_viewed_pages'] = 0
    super_hit['not_viewed_hits'] = 0
    super_hit['bandwith'] = 0
    super_hit['last_access'] = meta_visit['last_time']
    super_hit['pages'] = []
    super_hit['robot'] = isRobot(hit)
    appendHit(hit)

def isRobot(hit):
    for r in awstats_robots:
        if r.match(hit['http_user_agent']):
            return True
    return False

def decodeHTTPRequest(hit):
    if not 'request' in hit: return False

    groups = http_request_extracted.match(hit['request'])

    if groups:
        hit['extract_request'] = groups.groupdict()
        uri_groups = uri_re.match(hit['extract_request']['http_uri'])
        if uri_groups:
            d = uri_groups.groupdict()
            hit['extract_request']['extract_uri'] = d['extract_uri']
            if d['extract_parameters']:
                hit['extract_request']['extract_parameters'] = d['extract_parameters']
    else:
        print "Bad request extraction " + hit['request']
        return False

    referer_groups = uri_re.match(hit['http_referer'])
    if referer_groups:
        hit['extract_referer'] = referer_groups.groupdict()
    return True

def decodeTime(hit):
    t = hit['time_local']

    hit['time_decoded'] = time.strptime(t, time_format)

def buildPages():
    for filename in display.keys():
        page = display[filename]
        with open(DISPLAY_ROOT + filename, 'w') as f:
            f.write('<html><title>%s</title><body>' % (page['title']))
            for block in page['blocks']:
                if block['type'] == 'html':
                    f.write(block['value'])
                elif block['type'] == 'table':
                    f.write('<table>')
                    f.write('<tr>')
                    for title in block['cols']:
                        f.write('<th>%s</th>' % (title))
                    f.write('</tr>')
                    for row in block['rows']:
                        f.write('<tr>')
                        for v in row:
                            f.write('<td>%s</td>' % (v))
                        f.write('</tr>')
                    f.write('</table>')
            f.write('</body></html>')

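# For reference, buildPages() above consumes 'display' entries shaped like
# this (structure inferred from the code, example values invented) :
#
#   display['2014/index_11.html'] = {
#       'title' : 'Stats 11/2014',
#       'blocks' : [
#           {'type' : 'html', 'value' : '<p>...</p>'},
#           {'type' : 'table', 'title' : 'By day',
#            'cols' : ['Day', 'Visits'], 'rows' : [['9', '42']]},
#       ],
#   }
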
def generateDisplayDaysStat():
    cur_time = meta_visit['last_time']
    title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
    filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
    page = createPage(filename, title)

    days = {'type' : 'table', 'title' : 'By day'}
    days['cols'] = ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth']
    days['rows'] = []
    keys = current_visits['days_stats'].keys()
    keys.sort()
    nb_visits = 0
    for k in keys:
        stats = current_visits['days_stats'][k]
        row = [k, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
               stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        row = map(lambda v: str(v), row)
        days['rows'].append(row)
        nb_visits += stats['nb_visitors']

    stats = current_visits['month_stats']

    nb_days = len(keys)
    row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'],
           stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
    if nb_days:
        row = map(lambda v: str(int(v/nb_days)), row)
    else:
        row = map(lambda v: '0', row)

    row[0] = 'Average'
    days['rows'].append(row)

    row = ['Total', nb_visits, stats['viewed_pages'], stats['viewed_hits'],
           stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
    row = map(lambda v: str(v), row)
    days['rows'].append(row)
    page['blocks'].append(days)

def generateDisplay():
    generateDisplayDaysStat()
    callPlugins(DISPLAY_HOOK_DIRECTORY, current_visits, display)
    buildPages()

def generateStats(visits):
    stats = {}
    stats['viewed_bandwidth'] = 0
    stats['not_viewed_bandwidth'] = 0
    stats['viewed_pages'] = 0
    stats['viewed_hits'] = 0
    #stats['pages'] = set()
    stats['nb_visitors'] = 0

    for k in visits.keys():
        super_hit = visits[k]
        if super_hit['robot']:
            stats['not_viewed_bandwidth'] += super_hit['bandwith']
            continue

        print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])

        stats['nb_visitors'] += 1
        stats['viewed_bandwidth'] += super_hit['bandwith']
        stats['viewed_pages'] += super_hit['viewed_pages']
        stats['viewed_hits'] += super_hit['viewed_hits']

        # for p in super_hit['pages']:
        #     if not p['is_page']: continue
        #     req = p['extract_request']
        #     stats['pages'].add(req['extract_uri'])

    return stats

def generateMonthStats():
    # Reset the module-level display dictionary for the new month
    global display
    display = createEmptyDisplay()

    visits = current_visits['visits']

    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
    print stats

    valid_visitors = {k: v for (k, v) in visits.items() if not v['robot']}
    callPlugins(POST_HOOK_DIRECTORY, valid_visitors)

    current_visits['month_stats'] = stats

    path = getDBFilename(cur_time)
    if os.path.exists(path):
        os.remove(path)

    print "==> Serialize to %s" % path

    serialize(current_visits, path)

    generateDisplay()

def generateDayStats():
    visits = current_visits['visits']

    callPlugins(PRE_HOOK_DIRECTORY, visits)

    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

    # Remove the numbers already attributed to the last stored day
    if cur_time.tm_mday > 1:
        last_day = cur_time.tm_mday - 1
        while last_day:
            if last_day in current_visits['days_stats'].keys():
                break
            last_day -= 1
        if last_day:
            for k in stats.keys():
                stats[k] -= current_visits['days_stats'][last_day][k]
        stats['nb_visitors'] = 0
        for k in visits.keys():
            if visits[k]['robot']: continue
            if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
                stats['nb_visitors'] += 1
    print stats

    current_visits['days_stats'][cur_time.tm_mday] = stats

def newHit(hit):
    global current_visits
    global analyse_started

    decodeTime(hit)

    t = hit['time_decoded']

    cur_time = meta_visit['last_time']

    if cur_time is None:
        current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
        analyse_started = True
    else:
        if not analyse_started:
            if time.mktime(cur_time) >= time.mktime(t):
                # Hit already processed in a previous run : skip it but
                # keep reading the log (a bare return would make the
                # caller stop at the first old line)
                return True
            else:
                analyse_started = True
                current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
        if cur_time.tm_mon != t.tm_mon:
            generateMonthStats()
            current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
        elif cur_time.tm_mday != t.tm_mday:
            generateDayStats()

    meta_visit['last_time'] = t

    if not decodeHTTPRequest(hit): return False

    for k in hit.keys():
        if hit[k] == '-': hit[k] = ''

    appendHit(hit)

    return True

print '==> Analysing log'

meta_visit = deserialize(META_PATH) or createEmptyMeta()

current_visits = createEmptyVisits()

with open('access.log') as f:
    for l in f:
        # print "line " + l

        groups = log_re.match(l)

        if groups:
            if not newHit(groups.groupdict()):
                break
        else:
            print "No match " + l

if analyse_started:
    generateDayStats()
    generateMonthStats()
    serialize(meta_visit, META_PATH)
else:
    print '==> Analyse not started : nothing to do'
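
# Typical run (assuming an nginx access log named 'access.log' next to this
# script) :
#   $ ./iwla.py
# Reports are written to ./output/<year>/index_<month>.html and databases
# to ./output/<year>/<month>_iwla.db.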
