
iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
from calendar import monthrange
from datetime import date

import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *

#
# Main class IWLA
# Parse logs, compute statistics, call plugins and produce output
# For now, only HTTP logs are valid
#
# Plugin requirements :
#   None
#
# Conf values needed :
#   analyzed_filename
#   domain_name
#
# Output files :
#   DB_ROOT/meta.db
#   DB_ROOT/year/month/iwla.db
#   OUTPUT_ROOT/index.html
#   OUTPUT_ROOT/year/month/index.html
#
# Statistics creation :
#
# meta =>
#   last_time
#   start_analysis_time
#   stats =>
#     year =>
#       month =>
#         viewed_bandwidth
#         not_viewed_bandwidth
#         viewed_pages
#         viewed_hits
#         nb_visits
#         nb_visitors
#
# month_stats :
#   viewed_bandwidth
#   not_viewed_bandwidth
#   viewed_pages
#   viewed_hits
#   nb_visits
#
# days_stats :
#   day =>
#     viewed_bandwidth
#     not_viewed_bandwidth
#     viewed_pages
#     viewed_hits
#     nb_visits
#     nb_visitors
#
# visits :
#   remote_addr =>
#     remote_addr
#     remote_ip
#     viewed_pages
#     viewed_hits
#     not_viewed_pages
#     not_viewed_hits
#     bandwidth
#     last_access
#     requests =>
#       [fields_from_format_log]
#       extract_request =>
#         extract_uri
#         extract_parameters*
#       extract_referer* =>
#         extract_uri
#         extract_parameters*
#     robot
#     hit_only
#     is_page
#
# valid_visitors:
#   month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
#
# Statistics update :
#   None
#
# Statistics deletion :
#   None
#

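# Example of a minimal local conf.py overriding default_conf (illustrative
# values, not part of this repository):
#
#   analyzed_filename = '/var/log/apache2/access.log'
#   domain_name = 'example.com'
#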
class IWLA(object):

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.1'

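    # conf.log_format uses $name placeholders that __init__ turns into named
    # regexp groups. An illustrative value (assumed, not taken from this file),
    # covering the fields read by this class, could be:
    #   log_format = '$remote_addr [$time_local] "$request" $status $body_bytes_sent "$http_referer"'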
    def __init__(self):
        print '==> Start'

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

    def getVersion(self):
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        if not key in dir(conf):
            return default
        else:
            return conf.__dict__[key]

    def _clearVisits(self):
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        return self.current_analysis['visits']

    def getValidVisitors(self):
        return self.valid_visitors

    def getDisplay(self):
        return self.display

    def getCurTime(self):
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        return conf.resources_path

    def getCSSPath(self):
        return conf.css_path

    def _clearMeta(self):
        self.meta_infos = {
            'last_time' : None,
            'start_analysis_time' : None
        }
        return self.meta_infos

    def _clearDisplay(self):
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

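    # The on-disk databases are plain pickle dumps compressed with gzip:
    # the object is pickled to filename + '.tmp', copied into a gzip file,
    # then the temporary file is removed.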
    def _serialize(self, obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # TODO : remove return
        #return

        with open(filename + '.tmp', 'wb+') as f:
            pickle.dump(obj, f)
            f.seek(0)
            with gzip.open(filename, 'w') as fzip:
                fzip.write(f.read())
        os.remove(filename + '.tmp')

    def _deserialize(self, filename):
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'r') as f:
            return pickle.load(f)
        return None

    def _callPlugins(self, target_root, *args):
        print '==> Call plugins (%s)' % target_root
        for (root, plugins) in self.plugins:
            if root != target_root: continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    print '\t%s' % (p)
                    mod.hook(*args)

    def isPage(self, request):
        for e in conf.pages_extensions:
            if request.endswith(e):
                return True

        return False

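    # Attach a parsed hit to its visitor entry (keyed by remote_addr),
    # updating bandwidth, last access time and the viewed/not viewed
    # page and hit counters.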
    def _appendHit(self, hit):
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

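    # Split the raw "$request" field into http_method/http_uri/http_version,
    # then split the URI (and the referer, when present) into
    # extract_uri/extract_parameters.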
    def _decodeHTTPRequest(self, hit):
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d.keys():
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            print "Bad request extraction " + hit['request']
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

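    # Parse $time_local with conf.time_format. When %z is not recognized by
    # strptime (Python < 3.2), the numeric GMT offset is stripped from the
    # string and re-applied by hand.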
    def _decodeTime(self, hit):
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError, e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                if gmt_offset_str[0] == '+':
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                else:
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']

    def getDisplayIndex(self):
        cur_time = self.meta_infos['last_time']
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

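    # Build the per-month page (year/month/index.html): one table row per day
    # of the month, plus 'Average' and 'Total' rows.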
    def _generateDisplayDaysStats(self):
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%02d' % (cur_time.tm_year, cur_time.tm_mon)
        filename = self.getCurDisplayPath('index.html')
        print '==> Generate display (%s)' % (filename)
        page = self.display.createPage(title, filename, conf.css_path)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, 'By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = 'Average'
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = 'Total'
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        cur_time = time.localtime()
        months_name = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        title = 'Summary %d' % (year)
        cols = ['Month', 'Visitors', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth', 'Details']
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">Details</a>' % (year, i)
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)

        total[0] = 'Total'
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        months.appendRow(total)
        page.appendBlock(months)

    def _generateDisplayWholeMonthStats(self):
        title = 'Stats for %s' % (conf.domain_name)
        filename = 'index.html'
        print '==> Generate main page (%s)' % (filename)

        page = self.display.createPage(title, filename, conf.css_path)

        last_update = '<b>Last update</b> %s<br />' % (time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

        for year in self.meta_infos['stats'].keys():
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _generateDisplay(self):
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        self.display.build(conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        stats['nb_visits'] = 0

        return stats

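    # Aggregate the day stats into month stats, compute valid_visitors, call
    # post-analysis hooks, serialize the month database, store the month
    # summary in meta_infos['stats'] and rebuild the HTML output.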
    def _generateMonthStats(self):
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for (day, stat) in self.current_analysis['days_stats'].items():
            for k in stats.keys():
                stats[k] += stat[k]

        duplicated_stats = {k:v for (k,v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon)
        print stats

        if not 'month_stats' in self.current_analysis.keys():
            self.current_analysis['month_stats'] = stats
        else:
            for (k,v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        self.valid_visitors = {}
        for (k,v) in visits.items():
            if v['robot']: continue
            if not conf.count_hit_only_visitors and\
               (not v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print "==> Serialize to %s" % path
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if not 'stats' in self.meta_infos.keys():
            self.meta_infos['stats'] = {}
        if not year in self.meta_infos['stats'].keys():
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self._generateDisplay()

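    # Compute the current day's stats (bandwidth, pages, hits, visits) from
    # the requests made that day, after running pre-analysis hooks.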
    def _generateDayStats(self):
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for (k, super_hit) in visits.items():
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            viewed_pages = False
            for hit in super_hit['requests'][::-1]:
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or\
                   not self.hasBeenViewed(hit):
                    stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
                    continue
                stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            if (conf.count_hit_only_visitors or\
                viewed_pages) and\
               not super_hit['robot']:
                stats['nb_visits'] += 1

        print "== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

        print stats

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

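    # Handle one parsed log line: decode its time, roll over to a new day or
    # month when needed (generating the corresponding stats), then register
    # the hit. Returns False for lines older than the last analysis.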
    def _newHit(self, hit):
        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if time.mktime(t) <= time.mktime(cur_time):
                return False
            self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon:
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

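    # Entry point: load the previous databases, preload the plugins, feed
    # every log line through log_re/_newHit, then serialize the updated meta
    # informations once the analysis is done.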
    def start(self, _file):
        print '==> Load previous database'

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            print 'Last time'
            print self.meta_infos['last_time']
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        print '==> Analysing log'

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                if not self._newHit(groups.groupdict()):
                    continue
            else:
                print "No match for " + l
                #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            print '==> Analyse not started : nothing new'

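# Typical invocations (illustrative):
#   python iwla.py                           # analyse conf.analyzed_filename
#   python iwla.py -f /path/to/access.log    # analyse a specific file
#   cat access.log | python iwla.py -i       # read the log from stdin
#   python iwla.py -c                        # wipe DB_ROOT and DISPLAY_ROOT first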
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file')

    args = parser.parse_args()

    if args.clean_output:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    iwla = IWLA()

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        filename = args.file or conf.analyzed_filename
        if not os.path.exists(filename):
            print 'No such file \'%s\'' % (filename)
            sys.exit(-1)
        with open(filename) as f:
            iwla.start(f)
