iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015

# This file is part of iwla

# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date

import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *

"""
Main class IWLA
Parses logs, computes statistics, calls plugins and produces output
For now, only HTTP logs are supported

Plugin requirements :
    None

Conf values needed :
    analyzed_filename
    domain_name
    locales_path
    compress_output_files*

Output files :
    DB_ROOT/meta.db
    DB_ROOT/year/month/iwla.db
    OUTPUT_ROOT/index.html
    OUTPUT_ROOT/year/_stats.html
    OUTPUT_ROOT/year/month/index.html

Statistics creation :

meta :
    last_time
    start_analysis_time
    stats =>
        year =>
            month =>
                viewed_bandwidth
                not_viewed_bandwidth
                viewed_pages
                viewed_hits
                nb_visits
                nb_visitors

month_stats :
    viewed_bandwidth
    not_viewed_bandwidth
    viewed_pages
    viewed_hits
    nb_visits

days_stats :
    day =>
        viewed_bandwidth
        not_viewed_bandwidth
        viewed_pages
        viewed_hits
        nb_visits
        nb_visitors

visits :
    remote_addr =>
        remote_addr
        remote_ip
        viewed_pages
        viewed_hits
        not_viewed_pages
        not_viewed_hits
        bandwidth
        last_access
        requests =>
            [fields_from_format_log]
        extract_request =>
            extract_uri
            extract_parameters*
        extract_referer* =>
            extract_uri
            extract_parameters*
        robot
        hit_only
        is_page

valid_visitors:
    month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)

Statistics update :
    None

Statistics deletion :
    None
"""
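# The conf values listed above normally live in conf.py, which overrides
# default_conf.py at import time (see the conf.__dict__.update() call above).
# A minimal sketch, with purely illustrative values (only the key names come
# from the docstring):
#
#   analyzed_filename = '/var/log/nginx/access.log'
#   domain_name = 'example.com'
#   locales_path = './locales'
#   compress_output_files = ['.html', '.css', '.js']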


class IWLA(object):

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.2-dev'

    def __init__(self, logLevel):
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
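        # Illustrative example (the real format string comes from
        # conf.log_format): for a format naming fields such as $remote_addr,
        # $time_local, $request, $status, $body_bytes_sent, $http_referer and
        # $server_name (all read later in this file), the two substitutions
        # above first escape literal characters, then turn every $field into
        # a named group, e.g. '$status' -> '(?P<status>.+)'.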
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info('==> Start')
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = t.ugettext

    def getVersion(self):
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        if not key in dir(conf):
            return default
        else:
            return conf.__dict__[key]

    def _clearVisits(self):
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        return self.current_analysis['visits']

    def getValidVisitors(self):
        return self.valid_visitors

    def getDisplay(self):
        return self.display

    def getCurTime(self):
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        return conf.resources_path

    def getCSSPath(self):
        return conf.css_path

    def _clearMeta(self):
        self.meta_infos = {
            'last_time' : None,
            'start_analysis_time' : None
        }
        return self.meta_infos

    def _clearDisplay(self):
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _serialize(self, obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # TODO : remove return
        #return

        with open(filename + '.tmp', 'wb+') as f, gzip.open(filename, 'w') as fzip:
            pickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
        os.remove(filename + '.tmp')

    def _deserialize(self, filename):
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'r') as f:
            return pickle.load(f)
        return None

    def _callPlugins(self, target_root, *args):
        self.logger.info('==> Call plugins (%s)' % (target_root))
        for (root, plugins) in self.plugins:
            if root != target_root: continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    self.logger.info('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        for e in conf.pages_extensions:
            if request.endswith(e):
                return True

        return False
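        # Illustrative example (the real list comes from conf.pages_extensions):
        # with pages_extensions = ['/', '.html', '.php'], a request for
        # '/blog/index.html' counts as a page while '/style.css' counts as a hit.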

    def _appendHit(self, hit):
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

    def _decodeHTTPRequest(self, hit):
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d.keys():
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError, e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                if gmt_offset_str[0] == '-':
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                else:
                    hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']
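        # Worked example for the fallback above (hypothetical input): with
        # time_local '10/Oct/2015:12:34:56 +0200' and a conf.time_format ending
        # in ' %z', strptime() is retried on '10/Oct/2015:12:34:56', then the
        # 7200-second offset derived from '+0200' is subtracted via
        # mktime()/localtime().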

    def getDisplayIndex(self):
        cur_time = self.meta_infos['last_time']
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplayMonthStats(self, page, year, month_stats):
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        page_.build(conf.DISPLAY_ROOT, False)

    def _generateDisplayWholeMonthStats(self):
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        last_update = '<b>%s</b> %s<br />' % (self._('Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _compressFile(self, build_time, root, filename):
        path = os.path.join(root, filename)
        gz_path = path + '.gz'

        self.logger.debug('Compress %s => %s' % (path, gz_path))

        if not os.path.exists(gz_path) or\
           os.stat(path).st_mtime > build_time:
            with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
                f_out.write(f_in.read())

    def _compressFiles(self, build_time, root):
        if not conf.compress_output_files: return
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                for ext in conf.compress_output_files:
                    if f.endswith(ext):
                        self._compressFile(build_time, rootdir, f)
                        break

    def _generateDisplay(self):
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        build_time = time.localtime()
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        stats['nb_visits'] = 0

        return stats

    def _generateMonthStats(self):
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for (day, stat) in self.current_analysis['days_stats'].items():
            for k in stats.keys():
                stats[k] += stat[k]

        duplicated_stats = {k:v for (k,v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        self.logger.info(stats)

        if not 'month_stats' in self.current_analysis.keys():
            self.current_analysis['month_stats'] = stats
        else:
            for (k,v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        self.valid_visitors = {}
        for (k,v) in visits.items():
            if v['robot']: continue
            if not (conf.count_hit_only_visitors or\
                    v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        self.logger.info("==> Serialize to %s" % (path))
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if not 'stats' in self.meta_infos.keys():
            self.meta_infos['stats'] = {}
        if not year in self.meta_infos['stats'].keys():
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self._generateDisplay()

    def _generateDayStats(self):
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for (k, super_hit) in visits.items():
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            viewed_pages = False
            for hit in super_hit['requests'][::-1]:
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or\
                   not self.hasBeenViewed(hit):
                    stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
                    continue
                stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            if (conf.count_hit_only_visitors or\
                viewed_pages) and\
               not super_hit['robot']:
                stats['nb_visits'] += 1

        self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
        self.logger.info(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        if not self.domain_name_re.match(hit['server_name']):
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if time.mktime(t) <= time.mktime(cur_time):
                return False
            self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon:
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

    def start(self, _file):
        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict())
            else:
                self.logger.warning("No match for %s" % (l))
                #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            self.logger.info('==> Analyse not started : nothing new')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    args = parser.parse_args()
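
    # Typical invocations (illustrative; file names are placeholders):
    #   python iwla.py                      analyse conf.analyzed_filename
    #   python iwla.py -f access.log.1 -d DEBUG
    #   cat access.log | python iwla.py -i
    #   python iwla.py -c                   remove DB_ROOT and DISPLAY_ROOT before starting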

    if args.clean_output:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin)
    else:
        filename = args.file or conf.analyzed_filename
        if not os.path.exists(filename):
            print 'No such file \'%s\'' % (filename)
            sys.exit(-1)
        with open(filename) as f:
            iwla.start(f)
