iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2015

# This file is part of iwla

# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#

import os
import shutil
import sys
import re
import time
import pickle
import gzip
import importlib
import argparse
import logging
import gettext
from calendar import monthrange
from datetime import date, datetime

import default_conf as conf
import conf as user_conf

from iplugin import *
from display import *

"""
Main class IWLA
Parse logs, compute statistics, call plugins and produce output
For now, only HTTP logs are valid

Plugin requirements :
    None

Conf values needed :
    analyzed_filename
    domain_name
    locales_path
    compress_output_files*

Output files :
    DB_ROOT/meta.db
    DB_ROOT/year/month/iwla.db
    OUTPUT_ROOT/index.html
    OUTPUT_ROOT/year/_stats.html
    OUTPUT_ROOT/year/month/index.html

Statistics creation :

meta :
    last_time
    start_analysis_time
    stats =>
        year =>
            month =>
                viewed_bandwidth
                not_viewed_bandwidth
                viewed_pages
                viewed_hits
                nb_visits
                nb_visitors

month_stats :
    viewed_bandwidth
    not_viewed_bandwidth
    viewed_pages
    viewed_hits
    nb_visits

days_stats :
    day =>
        viewed_bandwidth
        not_viewed_bandwidth
        viewed_pages
        viewed_hits
        nb_visits
        nb_visitors

visits :
    remote_addr =>
        remote_addr
        remote_ip
        viewed_pages
        viewed_hits
        not_viewed_pages
        not_viewed_hits
        bandwidth
        last_access
        requests =>
            [fields_from_format_log]
            extract_request =>
                http_method
                http_uri
                http_version
                extract_uri
                extract_parameters*
            extract_referer* =>
                extract_uri
                extract_parameters*
        robot
        hit_only
        is_page

valid_visitors:
    month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)

Statistics update :
    None

Statistics deletion :
    None
"""


class IWLA(object):

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.2-dev'

    def __init__(self, logLevel):
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
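        # Added note (not in the original file): the two re.sub() calls above
        # turn conf.log_format into a regex with one named group per $field.
        # For a hypothetical log_format of '$remote_addr [$time_local] "$request"',
        # every character that is neither a word character nor '$' is escaped,
        # then each $field becomes (?P<field>.+), giving roughly:
        #   (?P<remote_addr>.+)\ \[(?P<time_local>.+)\]\ \"(?P<request>.+)\"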
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info('==> Start')
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = t.ugettext

    def getVersion(self):
        return IWLA.IWLA_VERSION

    def getConfValue(self, key, default=None):
        if not key in dir(conf):
            return default
        else:
            return conf.__dict__[key]

    def _clearVisits(self):
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        return self.current_analysis['visits']

    def getValidVisitors(self):
        return self.valid_visitors

    def getDisplay(self):
        return self.display

    def getCurTime(self):
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)

    def getResourcesPath(self):
        return conf.resources_path

    def getCSSPath(self):
        return conf.css_path

    def _clearMeta(self):
        self.meta_infos = {
            'last_time' : None,
            'start_analysis_time' : None
        }
        return self.meta_infos

    def _clearDisplay(self):
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)

    def _openDB(self, filename, prot='r'):
        if self.args.dont_compress:
            return open(filename, prot)
        else:
            return gzip.open(filename, prot)

    def _serialize(self, obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
            pickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
        os.remove(filename + '.tmp')

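    # Added note (not in the original file): _serialize() pickles the object
    # into a temporary '<filename>.tmp' file, copies its content into the final
    # database through _openDB() (gzip-compressed unless --dont-compress is
    # given), then deletes the temporary file.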
    def _deserialize(self, filename):
        if not os.path.exists(filename):
            return None

        with self._openDB(filename) as f:
            return pickle.load(f)
        return None

    def _callPlugins(self, target_root, *args):
        self.logger.info('==> Call plugins (%s)' % (target_root))
        for (root, plugins) in self.plugins:
            if root != target_root: continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    self.logger.info('\t%s' % (p))
                    mod.hook(*args)

    def isPage(self, request):
        self.logger.debug("Is page %s" % (request))
        for e in conf.pages_extensions:
            if request.endswith(e):
                self.logger.debug("True")
                return True
        self.logger.debug("False")
        return False

    def isMultimediaFile(self, request):
        self.logger.debug("Is multimedia %s" % (request))
        for e in conf.multimedia_files:
            if request.endswith(e):
                self.logger.debug("True")
                return True
        self.logger.debug("False")
        return False

    def _appendHit(self, hit):
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

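    # Added note (not in the original file): each parsed line is attached to a
    # per-remote_addr "super hit". Requests from visitors flagged as robots, or
    # whose HTTP status is not in conf.viewed_http_codes, go to the
    # not_viewed_* counters; everything else increments viewed_pages or
    # viewed_hits depending on isPage().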
    def _createVisitor(self, hit):
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0

    def _decodeHTTPRequest(self, hit):
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d.keys():
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

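    # Added example (comment only, not in the original file): for a hit whose
    # 'request' field is 'GET /blog/index.html?page=2 HTTP/1.1',
    # _decodeHTTPRequest() yields
    #   extract_request = {'http_method': 'GET',
    #                      'http_uri': '/blog/index.html?page=2',
    #                      'http_version': 'HTTP/1.1',
    #                      'extract_uri': '/blog/index.html',
    #                      'extract_parameters': 'page=2'}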
    def _decodeTime(self, hit):
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError, e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                # if gmt_offset_str[0] == '-':
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                # else:
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']

    def getDisplayIndex(self):
        cur_time = self.meta_infos['last_time']
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

    def _generateDisplayDaysStats(self):
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

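    # Added note (not in the original file): the 'By day' table gets one row
    # per calendar day of the current month (zeroes for days without data),
    # followed by an 'Average' and a 'Total' row; weekend rows and the current
    # day are tagged with the iwla_weekend / iwla_curday CSS classes.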
    def _generateDisplayMonthStats(self, page, year, month_stats):
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        page_.build(conf.DISPLAY_ROOT, False)

    def _generateDisplayWholeMonthStats(self):
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)

    def _compressFile(self, build_time, root, filename):
        path = os.path.join(root, filename)
        gz_path = path + '.gz'

        self.logger.debug('Compress %s => %s' % (path, gz_path))

        if not os.path.exists(gz_path) or\
           os.stat(path).st_mtime > build_time:
            with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
                f_out.write(f_in.read())

    def _compressFiles(self, build_time, root):
        if not conf.compress_output_files: return
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                for ext in conf.compress_output_files:
                    if f.endswith(ext):
                        self._compressFile(build_time, rootdir, f)
                        break

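    # Added note (not in the original file): _compressFiles() walks the
    # generated output and writes a '.gz' copy next to every file whose
    # extension is listed in conf.compress_output_files, refreshing it when the
    # source file changed since the current build (presumably so the web server
    # can serve pre-compressed pages).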
    def _generateDisplay(self):
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        build_time = time.localtime()
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)

    def _createEmptyStats(self):
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        stats['nb_visits'] = 0

        return stats

    def _generateMonthStats(self):
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._createEmptyStats()
        for (day, stat) in self.current_analysis['days_stats'].items():
            for k in stats.keys():
                stats[k] += stat[k]

        duplicated_stats = {k:v for (k,v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        self.logger.info(stats)

        if not 'month_stats' in self.current_analysis.keys():
            self.current_analysis['month_stats'] = stats
        else:
            for (k,v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        self.valid_visitors = {}
        for (k,v) in visits.items():
            if v['robot']: continue
            if not (conf.count_hit_only_visitors or\
                    v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        self.logger.info("==> Serialize to %s" % (path))
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if not 'stats' in self.meta_infos.keys():
            self.meta_infos['stats'] = {}
        if not year in self.meta_infos['stats'].keys():
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self.logger.info("==> Serialize to %s" % (conf.META_PATH))
        self._serialize(self.meta_infos, conf.META_PATH)

        self._generateDisplay()

    def _generateDayStats(self):
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for (k, super_hit) in visits.items():
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            viewed_pages = False
            for hit in super_hit['requests'][::-1]:
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or\
                   not self.hasBeenViewed(hit):
                    stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
                    continue
                stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            if (conf.count_hit_only_visitors or\
                viewed_pages) and\
                not super_hit['robot']:
                stats['nb_visits'] += 1

        self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
        self.logger.info(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

    def _newHit(self, hit):
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s" % (hit))
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started and\
               time.mktime(t) <= time.mktime(cur_time):
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon:
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

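    # Added note (not in the original file): _newHit() is called for every
    # matched log line. Crossing a day boundary triggers _generateDayStats(),
    # and crossing a month boundary additionally triggers _generateMonthStats()
    # (which serializes the database and rebuilds the HTML output) before the
    # database for the new month is loaded.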
    def _reset(self):
        reset_time = time.strptime(self.args.reset, '%m/%Y')

        self.logger.info('Reset time')
        self.logger.info(reset_time)

        self.meta_infos['last_time'] = reset_time

        cur_time = time.localtime()
        year = reset_time.tm_year
        while year < cur_time.tm_year:
            db_path = os.path.join(conf.DB_ROOT, str(year))
            if os.path.exists(db_path): shutil.rmtree(db_path)
            output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
            if os.path.exists(output_path): shutil.rmtree(output_path)
            year += 1
        month = reset_time.tm_mon
        while month <= cur_time.tm_mon:
            db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
            if os.path.exists(db_path): shutil.rmtree(db_path)
            output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
            if os.path.exists(output_path): shutil.rmtree(output_path)
            month += 1

    def start(self, _file, args):
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict())
            else:
                self.logger.warning("No match for %s" % (l))
                #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')


class FileIter(object):
    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print 'No such file \'%s\'' % (f)
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _openNextFile(self):
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        l = self.cur_file.readline()
        if not l:
            self._openNextFile()
            l = self.cur_file.readline()
        return l[:-1]

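# Added note (not in the original file): FileIter chains the comma-separated
# file list (conf.analyzed_filename or the -f argument), transparently opening
# '.gz' files with gzip, and yields one log line at a time to IWLA.start().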
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster)')

    args = parser.parse_args()

    # Load user conf
    for (k,v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if type(conf.__dict__[new_k]) == list:
                    if type(v) == list:
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exist in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

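    # Added example (comment only, not in the original file): a user conf.py
    # can either override a default value or extend a list with the '_append'
    # suffix handled above, e.g.:
    #   analyzed_filename = '/var/log/apache2/access.log'  # replaces the default
    #   multimedia_files_append = ['webp']                  # extends the default list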
    if args.clean_output:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)
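
# Added usage sketch (comments only, not part of the original file), based on
# the options defined above:
#   ./iwla.py                          # analyse conf.analyzed_filename
#   ./iwla.py -f access.log.1.gz -d DEBUG
#   zcat access.log.*.gz | ./iwla.py -i
#   ./iwla.py -r 01/2015               # drop saved stats from January 2015 onwards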
