iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python

import os
import re
import time
import pickle
import gzip
import importlib
from calendar import monthrange

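# Start from the default configuration and override it with the values
# defined in the local conf module.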
import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *

class IWLA(object):

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1

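    # Initialise the analysis state and build the log parsing machinery:
    # conf.log_format is turned into a regular expression whose $field
    # placeholders become named groups, so one match() yields a dict of
    # raw log fields.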
    def __init__(self):
        print '==> Start'

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

    def getConfValue(self, key, default=None):
        if not key in dir(conf):
            return default
        else:
            return conf.__dict__[key]

    def _clearVisits(self):
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        return self.current_analysis['visits']

    def getValidVisitors(self):
        return self.valid_visitors

    def getDisplay(self):
        return self.display

    def getCurTime(self):
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename)

    def getResourcesPath(self):
        return conf.resources_path

    def getCSSPath(self):
        return conf.css_path

    def _clearMeta(self):
        self.meta_infos = {
            'last_time' : None
        }
        return self.meta_infos

    def _clearDisplay(self):
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME)

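    # Persistence helpers: the analysis state is pickled to a temporary
    # file, gzip-compressed into place, then the temporary file is removed.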
    def _serialize(self, obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # TODO : remove return
        #return

        with open(filename + '.tmp', 'wb+') as f:
            pickle.dump(obj, f)
            f.seek(0)
            with gzip.open(filename, 'w') as fzip:
                fzip.write(f.read())
        os.remove(filename + '.tmp')

    def _deserialize(self, filename):
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'r') as f:
            return pickle.load(f)
        return None

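    # Run the hook() entry point of every preloaded plugin registered for
    # the given hook directory (pre-analysis, post-analysis or display).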
    def _callPlugins(self, target_root, *args):
        print '==> Call plugins (%s)' % target_root
        for (root, plugins) in self.plugins:
            if root != target_root: continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    print '\t%s' % (p)
                    mod.hook(*args)

    def isPage(self, request):
        for e in conf.pages_extensions:
            if request.endswith(e):
                return True

        return False

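    # Attach a parsed hit to its visitor: create the visitor entry on first
    # sight, then update bandwidth, last access time and the viewed /
    # not viewed page and hit counters.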
    def _appendHit(self, hit):
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)
            return

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        status = int(hit['status'])
        if status not in conf.viewed_http_codes:
            return

        if super_hit['robot'] or\
           not status in conf.viewed_http_codes:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

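    # First request from this address: initialise the per-visitor counters
    # before accounting for the hit.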
    def _createVisitor(self, hit):
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0
        self._appendHit(hit)

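    # Split the raw request line into method, URI and HTTP version, then
    # separate the URI from its query string; the referer is decoded the
    # same way when present.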
    def _decodeHTTPRequest(self, hit):
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d.keys():
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            print "Bad request extraction " + hit['request']
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        return hit['time_decoded']

    def getDisplayIndex(self):
        cur_time = self.meta_infos['last_time']
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

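    # Build the per-month index page: one table row per day of the month
    # (visits, pages, hits, bandwidth), followed by an average row and a
    # total row.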
    def _generateDisplayDaysStat(self):
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
        filename = self.getCurDisplayPath('index.html')
        print '==> Generate display (%s)' % (filename)
        page = DisplayHTMLPage(title, filename, conf.css_path)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = DisplayHTMLBlockTableWithGraph('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'], nb_valid_rows=nb_month_days)
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(0, nb_month_days+1):
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [i, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visitors']
                nb_days += 1
            else:
                row = [i, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i, 4, bytesToStr(row[4]))
            days.setCellValue(i, 5, bytesToStr(row[5]))
            days.appendShortTitle(str(i))

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = 'Average'
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = 'Total'
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplay(self):
        self._generateDisplayDaysStat()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self.display.build(conf.DISPLAY_ROOT)

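    # Aggregate per-visitor counters into a single stats dict; robot
    # traffic only contributes to the not viewed bandwidth.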
    def _generateStats(self, visits):
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        #stats['requests'] = set()
        stats['nb_visitors'] = 0

        for (k, super_hit) in visits.items():
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth']
                continue

            #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])

            if conf.count_hit_only_visitors or\
               super_hit['viewed_pages']:
                stats['nb_visitors'] += 1
            stats['viewed_bandwidth'] += super_hit['bandwidth']
            stats['viewed_pages'] += super_hit['viewed_pages']
            stats['viewed_hits'] += super_hit['viewed_hits']

            # for p in super_hit['requests']:
            #     if not p['is_page']: continue
            #     req = p['extract_request']
            #     stats['requests'].add(req['extract_uri'])

        return stats

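    # Close the current month: compute month totals, keep the set of valid
    # (non-robot) visitors, run post-analysis plugins, save the database
    # and regenerate the HTML output.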
    def _generateMonthStats(self):
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._generateStats(visits)
        duplicated_stats = {k:v for (k,v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
        print stats

        self.current_analysis['month_stats'] = stats

        self.valid_visitors = {}
        for (k,v) in visits.items():
            if v['robot']: continue
            if conf.count_hit_only_visitors and\
               (not v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print "==> Serialize to %s" % path

        self._serialize(self.current_analysis, path)

        self._generateDisplay()

        # Save month stats
        year = '%d' % (cur_time.tm_year)
        month = '%d' % (cur_time.tm_mon)
        if not 'stats' in self.meta_infos.keys():
            self.meta_infos['stats'] = {}
        if not year in self.meta_infos['stats'].keys():
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

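    # Day rollover: derive the current day's statistics from the month
    # totals and the previously recorded day, then recount the visitors
    # whose last access falls on the current day.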
    def _generateDayStats(self):
        visits = self.current_analysis['visits']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

        if cur_time.tm_mday > 1:
            last_day = cur_time.tm_mday - 1
            while last_day:
                if last_day in self.current_analysis['days_stats'].keys():
                    break
                last_day -= 1
            if last_day:
                for k in stats.keys():
                    stats[k] -= self.current_analysis['days_stats'][last_day][k]
            stats['nb_visitors'] = 0
            for (k,v) in visits.items():
                if v['robot']: continue
                if conf.count_hit_only_visitors and\
                   (not v['viewed_pages']):
                    continue
                if v['last_access'].tm_mday == cur_time.tm_mday:
                    stats['nb_visitors'] += 1
        print stats

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

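    # Process one parsed log line: detect month and day changes (closing
    # the corresponding stats when needed), decode the request and feed it
    # to the visitor accounting.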
    def _newHit(self, hit):
        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started:
                if time.mktime(t) < time.mktime(cur_time):
                    return False
                else:
                    self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon:
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

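    # Entry point: reload the saved state, preload the plugins, parse the
    # log file line by line, then flush the day, month and meta databases.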
    def start(self):
        print '==> Load previous database'

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        print '==> Analysing log'

        with open(conf.analyzed_filename) as f:
            for l in f:
                # print "line " + l

                groups = self.log_re.match(l)

                if groups:
                    if not self._newHit(groups.groupdict()):
                        break
                else:
                    print "No match for " + l
                    #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            print '==> Analyse not started : nothing to do'
            self._generateMonthStats()

if __name__ == '__main__':
    iwla = IWLA()
    iwla.start()
