iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2
3import os
4import shutil
5import sys
6import re
7import time
8import pickle
9import gzip
10import importlib
11import argparse
12from calendar import monthrange
13from datetime import date
14
15import default_conf as conf
16import conf as _
17conf.__dict__.update(_.__dict__)
18del _
19
20from iplugin import *
21from display import *
22
23#
24# Main class IWLA
25# Parse logs, compute statistics, call plugins and produce output
26# For now, only HTTP logs are supported
27#
28# Plugin requirements :
29# None
30#
31# Conf values needed :
32# analyzed_filename
33# domain_name
34#
35# Output files :
36# DB_ROOT/meta.db
37# DB_ROOT/year/month/iwla.db
38# OUTPUT_ROOT/index.html
39# OUTPUT_ROOT/year/month/index.html
40#
41# Statistics creation :
42#
43# meta =>
44# last_time
45# start_analysis_time
46# stats =>
47# year =>
48# month =>
49# viewed_bandwidth
50# not_viewed_bandwidth
51# viewed_pages
52# viewed_hits
53# nb_visitors
54#
55# month_stats :
56# viewed_bandwidth
57# not_viewed_bandwidth
58# viewed_pages
59# viewed_hits
60# nb_visitors
61#
62# days_stats :
63# day =>
64# viewed_bandwidth
65# not_viewed_bandwidth
66# viewed_pages
67# viewed_hits
68# nb_visitors
69#
70# visits :
71# remote_addr =>
72# remote_addr
73# remote_ip
74# viewed_pages
75# viewed_hits
76# not_viewed_pages
77# not_viewed_hits
78# bandwidth
79# last_access
80# requests =>
81# [fields_from_format_log]
82# extract_request =>
83# extract_uri
84# extract_parameters*
85# extract_referer* =>
86# extract_uri
87# extract_parameters*
88# robot
89# hit_only
90# is_page
91#
92# valid_visitors:
93# month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
94#
95# Statistics update :
96# None
97#
98# Statistics deletion :
99# None
100#
101
class IWLA(object):
    """Parse an HTTP access log, aggregate statistics and build HTML pages.

    The analysis state lives in three attributes:

    * ``meta_infos``: ``last_time``, ``start_analysis_time`` and, once a
      month has been computed, ``stats[year][month]``.
    * ``current_analysis``: ``days_stats``, ``month_stats`` and ``visits``
      for the month being analysed.
    * ``valid_visitors``: visits kept after filtering out robots and,
      unless ``conf.count_hit_only_visitors`` is set, hit-only visitors.

    Plugins are called through ``hook()`` at pre-analysis, post-analysis
    and display time.
    """

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.1'

    def __init__(self):
        """Initialise empty state and compile the log parsing regexps."""
        print('==> Start')

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Escape every character of the log format that is neither '$' nor
        # a word character, then turn each $name into a named group.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

    def getVersion(self):
        """Return the IWLA version string."""
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        """Return the configuration value ``key``, or ``default`` if unset."""
        return getattr(conf, key, default)

    def _clearVisits(self):
        """Reset and return the per-month analysis structure."""
        self.current_analysis = {
            'days_stats': {},
            'month_stats': {},
            'visits': {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        """Return the per-day statistics of the current month."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the statistics of the current month."""
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        """Return the visits of the current month, keyed by remote address."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return the filtered visitors (None until month stats are computed)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the display builder."""
        return self.display

    def getCurTime(self):
        """Return the time of the last accepted hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the time of the first hit accepted during this run."""
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        """Tell whether ``request`` is strictly newer than the analysis start."""
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time:
            return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        """Tell whether the request status is one of conf.viewed_http_codes."""
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        """Return ``year/month/filename`` for the month of the last hit."""
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        """Return conf.resources_path."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path."""
        return conf.css_path

    def _clearMeta(self):
        """Reset and return the meta information structure."""
        self.meta_infos = {
            'last_time': None,
            'start_analysis_time': None
        }
        return self.meta_infos

    def _clearDisplay(self):
        """Replace the display builder with a fresh one and return it."""
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        """Return the database path for the month of the struct_time ``time``."""
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _serialize(self, obj, filename):
        """Pickle ``obj`` and store it gzip-compressed in ``filename``.

        The pickle is first written to ``filename + '.tmp'``; that temporary
        file is removed even if pickling or compression fails.
        """
        base = os.path.dirname(filename)
        # dirname is '' for a bare filename: nothing to create in that case
        if base and not os.path.exists(base):
            os.makedirs(base)

        tmp_filename = filename + '.tmp'
        try:
            with open(tmp_filename, 'wb+') as f:
                pickle.dump(obj, f)
                f.seek(0)
                with gzip.open(filename, 'wb') as fzip:
                    fzip.write(f.read())
        finally:
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)

    def _deserialize(self, filename):
        """Load an object written by _serialize, or None if the file is missing."""
        if not os.path.exists(filename):
            return None

        # NOTE(review): pickle must only be used on trusted files; the
        # database directory is assumed to be written by this program only.
        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)

    def _callPlugins(self, target_root, *args):
        """Call ``hook(*args)`` on every loaded plugin registered for ``target_root``."""
        print('==> Call plugins (%s)' % target_root)
        for (root, plugins) in self.plugins:
            if root != target_root:
                continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    print('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        """Tell whether the URI ``request`` ends with one of conf.pages_extensions."""
        return request.endswith(tuple(conf.pages_extensions))

    def _appendHit(self, hit):
        """Attach ``hit`` to its visitor and update the visitor counters."""
        remote_addr = hit['remote_addr']

        if not remote_addr:
            return

        visits = self.current_analysis['visits']
        # Direct dict membership: .keys() builds a list on Python 2
        if remote_addr not in visits:
            self._createVisitor(hit)

        super_hit = visits[remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or\
           not int(hit['status']) in conf.viewed_http_codes:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        """Create the empty visitor entry for ``hit['remote_addr']``."""
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

    def _decodeHTTPRequest(self, hit):
        """Split the request line (and referer, if any) of ``hit``.

        Fills ``hit['extract_request']`` and, when a referer is present and
        matches, ``hit['extract_referer']``. Returns False when the hit has
        no ``request`` field or when the request line cannot be parsed.
        """
        if 'request' not in hit:
            return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d:
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            print("Bad request extraction " + hit['request'])
            return False

        # The log format may not define $http_referer at all
        if hit.get('http_referer'):
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        """Parse ``hit['time_local']`` into ``hit['time_decoded']`` and return it.

        On interpreters older than 3.2 a ValueError triggers a second
        attempt that strips the trailing UTC offset and applies it by hand;
        on newer interpreters the ValueError is re-raised.
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError:
            if sys.version_info >= (3, 2):
                raise
            # Try without UTC value at the end (%z not recognized)
            gmt_offset_str = hit['time_local'][-5:]
            gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
            gmt_offset_minutes = int(gmt_offset_str[3:5])*60
            gmt_offset = gmt_offset_hours + gmt_offset_minutes
            hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
            if gmt_offset_str[0] == '+':
                hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
            else:
                hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
        return hit['time_decoded']

    def getDisplayIndex(self):
        """Return the display page ``index.html`` of the current month."""
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        """Build the per-day table (with average and total rows) of the current month."""
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%02d' % (cur_time.tm_year, cur_time.tm_mon)
        filename = self.getCurDisplayPath('index.html')
        print('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, 'By day', ['Day', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'], None, nb_month_days, list(range(1, 6)))
        days.setColsCSSClass(['', 'iwla_visitor', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        days_stats = self.current_analysis['days_stats']
        month_name = time.strftime('%b', cur_time)
        today = date.today()
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, month_name)
            full_day = '%d %s %d' % (i, month_name, cur_time.tm_year)
            if i in days_stats:
                stats = days_stats[i]
                row = [full_day, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visitors']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            # weekday() is 5 for Saturday and 6 for Sunday
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == today:
                css = days.getCellCSSClass(i-1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Averages are computed over the days that have statistics only
        if nb_days:
            average_row = [int(v // nb_days) for v in row]
        else:
            average_row = [0] * len(row)

        average_row[0] = 'Average'
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = 'Total'
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append to ``page`` the twelve-month summary table of ``year``."""
        cur_time = time.localtime()
        months_name = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        title = 'Summary %d' % (year)
        cols = ['Month', 'Visitors', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth', 'Details']
        graph_cols = list(range(1, 6))
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats:
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">Details</a>' % (year, i)
                row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i-1, 4, bytesToStr(row[4]))
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css:
                    css = '%s %s' % (css, 'iwla_curday')
                else:
                    css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)

        total[0] = 'Total'
        total[4] = bytesToStr(total[4])
        total[5] = bytesToStr(total[5])
        months.appendRow(total)
        page.appendBlock(months)

    def _generateDisplayWholeMonthStats(self):
        """Build the main ``index.html`` with one summary table per known year."""
        title = 'Stats for %s' % (conf.domain_name)
        filename = 'index.html'
        print('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        last_update = '<b>Last update</b> %s<br />' % (time.strftime('%d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

        # Sorted so that the page layout does not depend on dict ordering
        for year in sorted(self.meta_infos['stats']):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _generateDisplay(self):
        """Generate every page, run display plugins and write the output."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        self.display.build(conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        """Return a statistics dict with every counter set to 0."""
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        stats['nb_visitors'] = 0

        return stats

    def _generateMonthStats(self):
        """Aggregate day statistics into month statistics, save and display them.

        Also rebuilds ``valid_visitors``, calls the post-analysis plugins,
        serializes the current analysis and records the month summary in
        ``meta_infos['stats'][year][month]``.
        """
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for (day, stat) in self.current_analysis['days_stats'].items():
            for k in stats.keys():
                stats[k] += stat[k]

        duplicated_stats = {k: v for (k, v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        print("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        print(stats)

        if 'month_stats' not in self.current_analysis:
            self.current_analysis['month_stats'] = stats
        else:
            for (k, v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        self.valid_visitors = {}
        for (k, v) in visits.items():
            if v['robot']:
                continue
            # Hit-only visitors are dropped unless the configuration asks to
            # count them (same rule as in _generateDayStats)
            if not (conf.count_hit_only_visitors or v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        nb_valid_visitors = len(self.valid_visitors)
        # 'nb_visitors' is the key read by the display code
        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = nb_valid_visitors
        # Legacy key, kept for anything that still reads it
        duplicated_stats['visitors'] = stats['visitors'] = nb_valid_visitors

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print("==> Serialize to %s" % path)
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if 'stats' not in self.meta_infos:
            self.meta_infos['stats'] = {}
        if year not in self.meta_infos['stats']:
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self._generateDisplay()

    def _generateDayStats(self):
        """Compute the statistics of the day of the last accepted hit.

        Calls the pre-analysis plugins first, then stores the result in
        ``current_analysis['days_stats'][day_of_month]``.
        """
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for (k, super_hit) in visits.items():
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            # Must be reset for every visitor
            viewed_pages = False
            # Requests are appended in log order: walk backwards and stop at
            # the first one that belongs to another day
            for hit in super_hit['requests'][::-1]:
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or\
                   not int(hit['status']) in conf.viewed_http_codes:
                    stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
                    continue
                stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            if (conf.count_hit_only_visitors or
                    viewed_pages):
                stats['nb_visitors'] += 1

        print("== Stats for %d/%02d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))

        print(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        """Process one parsed log line.

        Returns False when the hit is older than the last accepted one or
        when its request line cannot be decoded, True otherwise.
        """
        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time is None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if time.mktime(t) < time.mktime(cur_time):
                return False
            if not self.analyse_started:
                self.analyse_started = True
            if (cur_time.tm_year, cur_time.tm_mon) != (t.tm_year, t.tm_mon):
                # Close the last day of the month before aggregating it
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit):
            return False

        # '-' and '*' are placeholders for empty fields
        for k in list(hit.keys()):
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self, _file):
        """Run the whole analysis on the iterable of log lines ``_file``."""
        print('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        print('==> Analysing log')

        for l in _file:
            groups = self.log_re.match(l)

            if groups:
                if not self._newHit(groups.groupdict()):
                    continue
            else:
                print("No match for " + l)

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            print('==> Analyse not started : nothing to do')
            # Without any previous database there is no month to rebuild
            if self.meta_infos['last_time']:
                self._generateMonthStats()
614
if __name__ == '__main__':
    # Command line entry point: optionally wipe previous output, then
    # analyse either stdin or conf.analyzed_filename.
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    args = parser.parse_args()

    if args.clean_output:
        # Remove both the database and the generated pages
        for root in (conf.DB_ROOT, conf.DISPLAY_ROOT):
            if os.path.exists(root):
                shutil.rmtree(root)

    iwla = IWLA()

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        # Invalid configuration is a failure: report it through the exit status
        sys.exit(1)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        if not os.path.exists(conf.analyzed_filename):
            print('No such file \'%s\'' % (conf.analyzed_filename))
            sys.exit(-1)
        with open(conf.analyzed_filename) as f:
            iwla.start(f)

Archive Download this file

Branches

Tags