iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2
3import os
4import shutil
5import sys
6import re
7import time
8import pickle
9import gzip
10import importlib
11import argparse
12from calendar import monthrange
13from datetime import date
14
# Start from the shipped defaults, then let the user's own conf module
# override any value it defines.
import default_conf as conf
import conf as _
# Copy only real settings: module dunders (__name__, __file__, __doc__, ...)
# must keep describing default_conf, not the user module.
conf.__dict__.update((k, v) for (k, v) in _.__dict__.items() if not k.startswith('__'))
del _
19
20from iplugin import *
21from display import *
22
23#
24# Main class IWLA
25# Parse logs, compute statistics from them, call plugins and produce output
26# For now, only HTTP logs are supported
27#
28# Plugin requirements :
29# None
30#
31# Conf values needed :
32# analyzed_filename
33# domain_name
34# compress_output_files*
35#
36# Output files :
37# DB_ROOT/meta.db
38# DB_ROOT/year/month/iwla.db
39# OUTPUT_ROOT/index.html
40# OUTPUT_ROOT/year/month/index.html
41#
42# Statistics creation :
43#
44# meta :
45# last_time
46# start_analysis_time
47# stats =>
48# year =>
49# month =>
50# viewed_bandwidth
51# not_viewed_bandwidth
52# viewed_pages
53# viewed_hits
54# nb_visits
55# nb_visitors
56#
57# month_stats :
58# viewed_bandwidth
59# not_viewed_bandwidth
60# viewed_pages
61# viewed_hits
62# nb_visits
63#
64# days_stats :
65# day =>
66# viewed_bandwidth
67# not_viewed_bandwidth
68# viewed_pages
69# viewed_hits
70# nb_visits
71# nb_visitors
72#
73# visits :
74# remote_addr =>
75# remote_addr
76# remote_ip
77# viewed_pages
78# viewed_hits
79# not_viewed_pages
80# not_viewed_hits
81# bandwidth
82# last_access
83# requests =>
84# [fields_from_format_log]
85# extract_request =>
86# extract_uri
87# extract_parameters*
88# extract_referer* =>
89# extract_uri
90# extract_parameters*
91# robot
92# hit_only
93# is_page
94#
95# valid_visitors:
96# month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
97#
98# Statistics update :
99# None
100#
101# Statistics deletion :
102# None
103#
104
class IWLA(object):
    """Main analyzer: parse a log, compute statistics, call plugins, build output.

    State kept on the instance:

    meta_infos
        'last_time' (time of the last accepted hit), 'start_analysis_time'
        (time of the first hit seen in the current run) and, once a month
        has been closed, 'stats'[year][month] holding that month's totals
        plus 'nb_visitors'.
    current_analysis
        Data of the month being analysed: 'days_stats'[day], 'month_stats'
        and 'visits'[remote_addr] (one dictionary per visitor, holding its
        counters and the list of its 'requests').
    valid_visitors
        Subset of 'visits' computed by _generateMonthStats(): robots are
        excluded, as are hit-only visitors unless
        conf.count_hit_only_visitors is set.
    """

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.1'

    def __init__(self):
        """Create empty state and compile the regular expressions used for parsing."""
        print('==> Start')

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Build the line regex from conf.log_format: escape every character
        # that is neither '$' nor a word character, then turn each $name
        # into a named group.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
        # (hook directory, plugin names) pairs, one per kind of hook.
        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

    def getVersion(self):
        """Return the IWLA version string."""
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        """Return configuration value `key`, or `default` when it is not defined."""
        return getattr(conf, key, default)

    def _clearVisits(self):
        """Reset the current month analysis and return the new empty structure."""
        self.current_analysis = {
            'days_stats': {},
            'month_stats': {},
            'visits': {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per-day statistics of the current month."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the statistics of the current month."""
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        """Return the visitors of the current month, indexed by remote address."""
        return self.current_analysis['visits']

    # Correctly spelled alias; the historical name is kept for existing callers.
    getCurrentVisits = getCurrentVisists

    def getValidVisitors(self):
        """Return the visitors selected by _generateMonthStats() (None before that)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the display builder."""
        return self.display

    def getCurTime(self):
        """Return the time of the last accepted hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the time of the first hit seen in the current run."""
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        """Tell whether `request` is strictly newer than the start of the current run."""
        cur_time = self.meta_infos['start_analysis_time']
        # Analysis not started
        if not cur_time:
            return False
        return time.mktime(cur_time) < time.mktime(request['time_decoded'])

    def hasBeenViewed(self, request):
        """Tell whether the HTTP status of `request` is one of conf.viewed_http_codes."""
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        """Return `filename` prefixed with the year/month directory of the last hit."""
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        """Return conf.resources_path."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path."""
        return conf.css_path

    def _clearMeta(self):
        """Reset the meta information and return the new empty structure."""
        self.meta_infos = {
            'last_time': None,
            'start_analysis_time': None
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Replace the display builder by a fresh one and return it."""
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        """Return the path of the monthly database matching struct_time `time`."""
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _serialize(self, obj, filename):
        """Pickle `obj` into the gzip file `filename`, creating its directory if needed."""
        base = os.path.dirname(filename)
        if base and not os.path.exists(base):
            os.makedirs(base)

        # Pickle straight into the compressed stream: no intermediate
        # '.tmp' file that could be left behind on error.
        with gzip.open(filename, 'wb') as fzip:
            pickle.dump(obj, fzip)

    def _deserialize(self, filename):
        """Return the object pickled in gzip file `filename`, or None if it does not exist."""
        if not os.path.exists(filename):
            return None

        # NOTE: pickle executes code found in the file; only databases
        # written by _serialize() must ever be loaded here.
        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)

    def _callPlugins(self, target_root, *args):
        """Call hook(*args) on every preloaded plugin registered under `target_root`."""
        print('==> Call plugins (%s)' % target_root)
        for (root, plugins) in self.plugins:
            if root != target_root:
                continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    print('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        """Tell whether URI `request` ends with one of conf.pages_extensions."""
        return any(request.endswith(e) for e in conf.pages_extensions)

    @staticmethod
    def _hitSize(hit):
        """Return the number of body bytes sent for `hit` (0 when the field is empty).

        _newHit() turns a logged '-' into '', which int() cannot parse.
        """
        return int(hit['body_bytes_sent'] or 0)

    def _appendHit(self, hit):
        """Attach `hit` to its visitor (created on first sight) and update its counters."""
        remote_addr = hit['remote_addr']

        if not remote_addr:
            return

        visits = self.current_analysis['visits']
        if remote_addr not in visits:
            self._createVisitor(hit)

        super_hit = visits[remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += self._hitSize(hit)
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        # Hits from robots and hits whose status is not a "viewed" one go
        # to the not_viewed_* counters.
        if super_hit['robot'] or not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        """Register an empty visitor entry for the remote address of `hit`."""
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] (and the referer, if any) into their components.

        Fills hit['extract_request'] and, when a referer is present,
        hit['extract_referer']. Returns False when the hit has no 'request'
        field or when that field cannot be parsed, True otherwise.
        """
        if 'request' not in hit:
            return False

        groups = self.http_request_extracted.match(hit['request'])
        if not groups:
            print("Bad request extraction " + hit['request'])
            return False

        request = hit['extract_request'] = groups.groupdict()
        uri_groups = self.uri_re.match(request['http_uri'])
        if uri_groups:
            d = uri_groups.groupdict()
            request['extract_uri'] = d['extract_uri']
            # groupdict() always holds the key; its value is None when the
            # URI has no query string.
            request['extract_parameters'] = d['extract_parameters']

        # The log format is configurable and may not define $http_referer.
        referer = hit.get('http_referer')
        if referer:
            referer_groups = self.uri_re.match(referer)
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        """Parse hit['time_local'] into hit['time_decoded'] (struct_time) and return it.

        Raises ValueError when the value does not match conf.time_format.
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError:
            if sys.version_info >= (3, 2):
                raise
            # Older interpreters do not recognise %z: parse the value
            # without its trailing "+HHMM"/"-HHMM" and apply the offset by hand.
            gmt_offset_str = hit['time_local'][-5:]
            gmt_offset_hours = int(gmt_offset_str[1:3]) * 60 * 60
            gmt_offset_minutes = int(gmt_offset_str[3:5]) * 60
            gmt_offset = gmt_offset_hours + gmt_offset_minutes
            decoded = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
            if gmt_offset_str[0] == '+':
                hit['time_decoded'] = time.localtime(time.mktime(decoded) + gmt_offset)
            else:
                hit['time_decoded'] = time.localtime(time.mktime(decoded) - gmt_offset)
        return hit['time_decoded']

    def getDisplayIndex(self):
        """Return the index page of the current month from the display builder."""
        filename = self.getCurDisplayPath('index.html')
        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        """Build the monthly page: one row per day, then an average and a total row."""
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%02d' % (cur_time.tm_year, cur_time.tm_mon)
        filename = self.getCurDisplayPath('index.html')
        print('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)

        nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)[1]
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, 'By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'], None, nb_month_days, list(range(1, 6)))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])

        days_stats = self.current_analysis['days_stats']
        month_name = time.strftime('%b', cur_time)
        today = date.today()
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days + 1):
            day = '%d<br/>%s' % (i, month_name)
            full_day = '%02d %s %d' % (i, month_name, cur_time.tm_year)
            if i in days_stats:
                stats = days_stats[i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # The row keeps raw byte counts; only the displayed cells are
            # replaced by their human readable form.
            days.setCellValue(i - 1, 4, bytesToStr(row[4]))
            days.setCellValue(i - 1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            if adate.weekday() in (5, 6):
                days.setRowCSSClass(i - 1, 'iwla_weekend')
            if adate == today:
                css = days.getCellCSSClass(i - 1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                days.setCellCSSClass(i - 1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Average over the days that actually have statistics. A list is
        # built explicitly because the cells are modified just below.
        if nb_days:
            average_row = [int(v // nb_days) for v in row]
        else:
            average_row = [0] * len(row)

        average_row[0] = 'Average'
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = 'Total'
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append to `page` the twelve-month summary table of `year`.

        `month_stats` maps a month number to its statistics; months without
        statistics are displayed as rows of zeros.
        """
        cur_time = time.localtime()
        months_name = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        title = 'Summary %d' % (year)
        cols = ['Month', 'Visitors', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth', 'Details']
        graph_cols = list(range(1, 7))
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats:
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">Details</a>' % (year, i)
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i - 1, 5, bytesToStr(row[5]))
            months.setCellValue(i - 1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i - 1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                months.setCellCSSClass(i - 1, 0, css)

        total[0] = 'Total'
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        months.appendRow(total)
        page.appendBlock(months)

    def _generateDisplayWholeMonthStats(self):
        """Build the main index page: last update time and one summary table per year."""
        title = 'Stats for %s' % (conf.domain_name)
        filename = 'index.html'
        print('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # '%d' is already the zero padded day of month; the former '%02d'
        # relied on a flag/width extension that not every platform's
        # strftime() accepts.
        last_update = '<b>Last update</b> %s<br />' % (time.strftime('%d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _compressFile(self, build_time, root, filename):
        """Write root/filename.gz when it is missing or older than the last build.

        `build_time` is the time at which the build started, either as
        seconds since the epoch or as a struct_time.
        """
        path = os.path.join(root, filename)
        gz_path = path + '.gz'
        # st_mtime is a number of seconds: a struct_time has to be converted
        # before the two can be compared meaningfully.
        if isinstance(build_time, time.struct_time):
            build_time = time.mktime(build_time)
        if not os.path.exists(gz_path) or os.stat(path).st_mtime >= build_time:
            with open(path, 'rb') as f_in:
                with gzip.open(gz_path, 'wb') as f_out:
                    f_out.write(f_in.read())

    def _compressFiles(self, build_time, root):
        """Compress every file under `root` whose name ends with one of conf.compress_output_files."""
        if not conf.compress_output_files:
            return
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                for ext in conf.compress_output_files:
                    if f.endswith(ext):
                        self._compressFile(build_time, rootdir, f)
                        break

    def _generateDisplay(self):
        """Generate every page, write them under conf.DISPLAY_ROOT and compress them."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        # Whole seconds, so that files written during the very second the
        # build starts still count as new on coarse-grained filesystems.
        build_time = int(time.time())
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        """Return a statistics dictionary with every counter set to 0."""
        return {
            'viewed_bandwidth': 0,
            'not_viewed_bandwidth': 0,
            'viewed_pages': 0,
            'viewed_hits': 0,
            'nb_visits': 0,
        }

    def _generateMonthStats(self):
        """Close the current month.

        Sums the per-day statistics, selects the valid visitors, calls the
        post analysis plugins, saves the monthly database, records the
        month totals in the meta information and generates the display.
        """
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for stat in self.current_analysis['days_stats'].values():
            for k in stats:
                stats[k] += stat[k]

        # Independent copy: it receives nb_visitors and goes to the meta
        # information.
        duplicated_stats = dict(stats)

        cur_time = self.meta_infos['last_time']
        print("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        print(stats)

        if 'month_stats' not in self.current_analysis:
            self.current_analysis['month_stats'] = stats
        else:
            for (k, v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        # Valid visitors: no robots and, unless conf.count_hit_only_visitors
        # is set, no visitor without a viewed page (same rule as the one
        # used for nb_visits in _generateDayStats()).
        self.valid_visitors = {}
        for (k, v) in visits.items():
            if v['robot']:
                continue
            if not (conf.count_hit_only_visitors or v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors)

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print("==> Serialize to %s" % path)
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        self.meta_infos.setdefault('stats', {}).setdefault(year, {})[month] = duplicated_stats

        self._generateDisplay()

    def _generateDayStats(self):
        """Call the pre analysis plugins then compute the statistics of the last hit's day."""
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for super_hit in visits.values():
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            # Must be reset for every visitor, otherwise the decision below
            # would reuse the value left by the previous one.
            viewed_pages = False
            # Requests are stored in arrival order: walk them backwards and
            # stop at the first one that belongs to another day.
            for hit in reversed(super_hit['requests']):
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or not self.hasBeenViewed(hit):
                    stats['not_viewed_bandwidth'] += self._hitSize(hit)
                    continue
                stats['viewed_bandwidth'] += self._hitSize(hit)
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            if (conf.count_hit_only_visitors or viewed_pages) and not super_hit['robot']:
                stats['nb_visits'] += 1

        print("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))

        print(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """Handle one parsed log line.

        Returns True when the hit has been added to the analysis, False
        when it is ignored (already analysed or not decodable).
        """
        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            hit_epoch = time.mktime(t)
            last_epoch = time.mktime(cur_time)
            # Never go back in time. A hit logged during the same second as
            # the last one is a duplicate only while catching up with a
            # previous run; once the analysis has started it is a distinct
            # request and must be kept.
            if hit_epoch < last_epoch or (hit_epoch == last_epoch and not self.analyse_started):
                return False
            self.analyse_started = True
            if (cur_time.tm_year, cur_time.tm_mon) != (t.tm_year, t.tm_mon):
                # The day being closed is also the last one of its month:
                # its statistics are needed by the month total.
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit):
            return False

        # Placeholders written by the server for empty fields
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self, _file):
        """Analyse every line of `_file` (an iterable of log lines), then save and display.

        Lines not matching the log format are reported and skipped.
        """
        print('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            print('Last time')
            print(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        print('==> Analysing log')

        for l in _file:
            groups = self.log_re.match(l)
            if groups:
                self._newHit(groups.groupdict())
            else:
                print("No match for " + l)

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            # Only meaningful during a run: not saved
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            print('==> Analyse not started : nothing new')
# Command line entry point: parse options, optionally wipe previous output,
# check the mandatory configuration, then analyse stdin or a log file.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file')

    args = parser.parse_args()

    # Remove both the databases and the generated pages
    if args.clean_output:
        if os.path.exists(conf.DB_ROOT):
            shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT):
            shutil.rmtree(conf.DISPLAY_ROOT)

    iwla = IWLA()

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        # Missing mandatory configuration is a failure: report it through
        # a non-zero exit status instead of pretending success.
        sys.exit(1)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        # -f takes precedence over the configured file
        filename = args.file or conf.analyzed_filename
        if not os.path.exists(filename):
            print('No such file \'%s\'' % (filename))
            sys.exit(-1)
        with open(filename) as f:
            iwla.start(f)

Archive Download this file

Branches

Tags