iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
22import os
23import shutil
24import sys
25import re
26import time
27import cPickle
28import gzip
29import importlib
30import argparse
31import logging
32import gettext
33from calendar import monthrange
34from datetime import date, datetime
35
36import default_conf as conf
37import conf as user_conf
38
39from iplugin import *
40from display import *
41
42"""
43Main class IWLA
44Parse Log, compute them, call plugins and produce output
45For now, only HTTP log are valid
46
47Plugin requirements :
48 None
49
50Conf values needed :
51 analyzed_filename
52 domain_name
53 locales_path
54 compress_output_files
55 excluded_ip
56
57Output files :
58 DB_ROOT/meta.db
59 DB_ROOT/year/month/iwla.db
60 OUTPUT_ROOT/index.html
61 OUTPUT_ROOT/year/_stats.html
62 OUTPUT_ROOT/year/month/index.html
63
64Statistics creation :
65
66meta :
67 last_time
68 start_analysis_time
69 stats =>
70 year =>
71 month =>
72 viewed_bandwidth
73 not_viewed_bandwidth
74 viewed_pages
75 viewed_hits
76 nb_visits
77 nb_visitors
78
79month_stats :
80 viewed_bandwidth
81 not_viewed_bandwidth
82 viewed_pages
83 viewed_hits
84 nb_visits
85
86days_stats :
87 day =>
88 viewed_bandwidth
89 not_viewed_bandwidth
90 viewed_pages
91 viewed_hits
92 nb_visits
93 nb_visitors
94
95visits :
96 remote_addr =>
97 remote_addr
98 remote_ip
99 viewed_pages{0..31} # 0 contains total
100 viewed_hits{0..31} # 0 contains total
101 not_viewed_pages{0..31}
102 not_viewed_hits{0..31}
103 bandwidth{0..31}
104 last_access
105 requests =>
106 [fields_from_format_log]
107 extract_request =>
108 http_method
109 http_uri
110 http_version
111 extract_uri
112 extract_parameters*
113 extract_referer* =>
114 extract_uri
115 extract_parameters*
116 robot
117 hit_only
118 is_page
119
120valid_visitors:
121 month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
122
123Statistics update :
124 None
125
126Statistics deletion :
127 None
128"""
129
130
class IWLA(object):
    """Main analyzer class.

    Parses the HTTP access log, computes statistics, calls plugins and
    produces the HTML output (see module docstring for the data layout).
    """

    # Analysis type tag exposed to plugins (only HTTP logs are handled).
    ANALYSIS_CLASS = 'HTTP'
    # Plugin API version, checked when hooks are loaded.
    API_VERSION = 1
    IWLA_VERSION = '0.5-dev'

    def __init__(self, logLevel, dry_run):
        """Build parsing regexes from conf, register hooks and set up logging.

        logLevel -- a logging.* level passed to logging.basicConfig()
        dry_run  -- when True, nothing is written to disk
        """
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None
        self.dry_run = dry_run

        # Turn conf.log_format into a regex: escape every character that is
        # not alphanumeric/underscore/'$'/'?', then replace each $field with
        # a named capture group of the same name.
        self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        # "METHOD URI VERSION" split of the raw request line.
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        # Split an URI into path ('extract_uri'), optional query string
        # ('extract_parameters') and an ignored fragment.
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.final_slashes_re = re.compile(r'/+$')
        # Pre-compiled regexes of client IPs to skip entirely.
        self.excluded_ip = []
        for ip in conf.excluded_ip:
            self.excluded_ip += [re.compile(ip)]
        # (hook directory, [plugin names]) pairs scanned by _callPlugins().
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        if self.dry_run:
            self.logger.info('==> Start (DRY RUN)')
        else:
            self.logger.info('==> Start')
        # Install the translation function self._ ; fall back to the
        # identity translation when the locale catalog can't be loaded.
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = t.ugettext
174
    def getVersion(self):
        """Return the iwla release string (e.g. '0.5-dev')."""
        return IWLA.IWLA_VERSION
177
178 def getConfValue(self, key, default=None):
179 if not key in dir(conf):
180 return default
181 else:
182 return conf.__dict__[key]
183
184 def _clearVisits(self):
185 self.current_analysis = {
186 'days_stats' : {},
187 'month_stats' : {},
188 'visits' : {}
189 }
190 self.valid_visitors = None
191 return self.current_analysis
192
    def getDaysStats(self):
        """Return the per-day statistics dict of the current month."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the aggregated statistics dict of the current month."""
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        """Return the visits dict, keyed by remote address."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return visitors kept after robot/hit-only filtering (None until
        _generateMonthStats() has run)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the DisplayHTMLBuild instance used to produce output."""
        return self.display

    def getCurTime(self):
        """Return the time of the last analysed hit (time.struct_time)."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the time of the first hit of this run (time.struct_time)."""
        return self.meta_infos['start_analysis_time']
213
214 def isValidForCurrentAnalysis(self, request):
215 cur_time = self.meta_infos['start_analysis_time']
216 # Analyse not started
217 if not cur_time: return False
218 return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
219
220 def hasBeenViewed(self, request):
221 return int(request['status']) in conf.viewed_http_codes
222
223 def getCurDisplayPath(self, filename):
224 cur_time = self.meta_infos['last_time']
225 return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
226
    def getResourcesPath(self):
        """Return conf.resources_path (static resources location)."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path, used when creating display pages."""
        return conf.css_path
232
233 def _clearMeta(self):
234 self.meta_infos = {
235 'last_time' : None,
236 'start_analysis_time' : None
237 }
238 return self.meta_infos
239
240 def _clearDisplay(self):
241 self.display.clear()
242return self.display
243
244 def getDBFilename(self, time):
245 return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
246
247 def _openDB(self, filename, prot='r'):
248 if self.args.dont_compress:
249 return open(filename, prot)
250 else:
251 return gzip.open(filename, prot)
252
    def _serialize(self, obj, filename):
        """Pickle *obj* into *filename*, keeping a .bak copy while writing.

        No-op in dry-run mode.  The object is first pickled into a plain
        .tmp file, then streamed into the (possibly gzipped) target, so a
        crash never leaves a half-written DB without its backup.
        """
        if self.dry_run: return
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # Make a backup in case of something fails
        if os.path.exists(filename):
            shutil.copy(filename, filename + '.bak')

        with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
            cPickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
            # Force the data to disk before dropping the temporaries.
            os.fsync(fzip)
        os.remove(filename + '.tmp')
        if os.path.exists(filename + '.bak'):
            os.remove(filename + '.bak')
271
272 def _deserialize(self, filename):
273 if not os.path.exists(filename):
274 return None
275
276 res = None
277 with self._openDB(filename) as f:
278 res = cPickle.load(f)
279 return res
280
281 def _callPlugins(self, target_root, *args):
282 self.logger.info('==> Call plugins (%s)' % (target_root))
283 for (root, plugins) in self.plugins:
284 if root != target_root: continue
285 for p in plugins:
286 mod = self.cache_plugins.get(root + '.' + p, None)
287 if mod:
288 self.logger.info('\t%s' % (p))
289 mod.hook(*args)
290
291 def isPage(self, request):
292 self.logger.debug("Is page %s" % (request))
293 for e in conf.pages_extensions:
294 if request.endswith(e):
295 self.logger.debug("True")
296 return True
297 self.logger.debug("False")
298 return False
299
300 def isMultimediaFile(self, request):
301 self.logger.debug("Is multimedia %s" % (request))
302 for e in conf.multimedia_files:
303 if request.endswith(e):
304 self.logger.debug("True")
305 return True
306 self.logger.debug("False")
307 return False
308
309 def isValidVisitor(self, hit):
310 if hit['robot']: return False
311 if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
312 return False
313 return True
314
    def isRobot(self, hit):
        """Return the robot flag of *hit*."""
        return hit['robot']
317
    def _appendHit(self, hit):
        """Add one parsed *hit* to its visitor, creating the visitor entry
        when needed.

        Updates the visitor's per-day and total counters: bandwidth and
        viewed / not viewed pages and hits.  Index 0 of each counter dict
        holds the monthly total, day-of-month keys hold per-day values.
        """
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        # Skip excluded IP addresses entirely.
        for ip in self.excluded_ip:
            if ip.match(remote_addr):
                return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        # Don't keep all requests for robots
        if not super_hit['robot']:
            super_hit['requests'].append(hit)

        day = self.meta_infos['last_time'].tm_mday
        # Bandwidth only counts for responses whose status is listed in
        # conf.viewed_http_codes.
        if self.hasBeenViewed(hit):
            super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
            super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        # Prefer the normalized URI when extraction succeeded.
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        # Robot traffic and non-viewed responses feed the 'not_viewed'
        # counters instead of the 'viewed' ones.
        if super_hit['robot'] or\
               not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
            super_hit[page_key][0] += 1
        else:
            super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
            super_hit[hit_key][0] += 1
361
362 def _createVisitor(self, hit):
363 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
364 super_hit['remote_addr'] = hit['remote_addr']
365 super_hit['remote_ip'] = hit['remote_addr']
366 super_hit['viewed_pages'] = {0:0}
367 super_hit['viewed_hits'] = {0:0}
368 super_hit['not_viewed_pages'] = {0:0}
369 super_hit['not_viewed_hits'] = {0:0}
370 super_hit['bandwidth'] = {0:0}
371 super_hit['last_access'] = self.meta_infos['last_time']
372 super_hit['requests'] = []
373 super_hit['robot'] = False
374 super_hit['hit_only'] = 0
375
376 def _normalizeURI(self, uri):
377 if uri == '/': return uri
378 uri = self.final_slashes_re.sub('/', uri)
379 return uri
380
381 def _removeFinalSlashes(self, uri):
382 if uri == '/': return uri
383 return self.final_slashes_re.sub('', uri)
384
385 def _normalizeParameters(self, parameters):
386 # No parameters
387 if parameters == '?': return None
388 return parameters
389
    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] into method/URI/version and normalize URIs.

        Fills hit['extract_request'] (and hit['extract_referer'] when a
        referer is present).  Returns False when there is no request field
        or the request line cannot be parsed.
        """
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict("")
            # Further split the URI into path + query string.
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict("")
                hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
                # Only keep a non-empty, normalized query string.
                if 'extract_parameters' in d.keys():
                    parameters = self._normalizeParameters(d['extract_parameters'])
                    if parameters:
                        hit['extract_request']['extract_parameters'] = parameters
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict("")
                hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
                hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
        return True
416
    def _decodeTime(self, hit):
        """Parse hit['time_local'] into hit['time_decoded'] (struct_time).

        Python < 3.2 strptime does not understand the %z timezone
        directive, so on failure the trailing UTC offset is stripped and
        the remainder parsed with a shortened format.  The resulting time
        is therefore naive (offset intentionally not applied, see the
        commented-out code below).
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError, e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                # if gmt_offset_str[0] == '-':
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                # else:
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']
435
436 def getDisplayIndex(self):
437 cur_time = self.meta_infos['last_time']
438 filename = self.getCurDisplayPath('index.html')
439
440 return self.display.getPage(filename)
441
    def _generateDisplayDaysStats(self):
        """Build the per-day statistics table of the current month's page.

        One row per day of the month (zero-filled when no data), plus an
        average row and a total row.  Week-end and current-day cells get
        dedicated CSS classes.
        """
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        # Embed the year summary built by _generateDisplayMonthStats().
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # Bandwidth columns rendered human readable.
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            # weekday() 5/6 == Saturday/Sunday.
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # The average is computed over days that actually had traffic.
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)
500
    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append the 12-month summary table of *year* to *page*.

        A second copy of the table without the 'Details' column is written
        to 'year/_stats.html', which the month pages embed as an iframe.
        """
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        # months_ is the 'Details'-less copy used for the iframe page.
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            # Bandwidth columns rendered human readable.
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            # Highlight the current month in both tables.
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        page_.build(conf.DISPLAY_ROOT, False)
551
    def _generateDisplayWholeMonthStats(self):
        """Build the main index page: update time, analysis duration and
        one summary table per analysed year (most recent first)."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        # Show how long this run took (seconds folded into a struct_time).
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)
574
575 def _compressFile(self, build_time, root, filename):
576 path = os.path.join(root, filename)
577 gz_path = path + '.gz'
578
579 self.logger.debug('Compress %s => %s' % (path, gz_path))
580
581 if not os.path.exists(gz_path) or\
582 os.stat(path).st_mtime >= build_time:
583 if self.dry_run: return
584 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
585 f_out.write(f_in.read())
586
587 def _compressFiles(self, build_time, root):
588 if not conf.compress_output_files: return
589 for rootdir, subdirs, files in os.walk(root, followlinks=True):
590 for f in files:
591 for ext in conf.compress_output_files:
592 if f.endswith(ext):
593 self._compressFile(build_time, rootdir, f)
594 break
595
    def _generateDisplay(self):
        """Build all HTML output: day table, plugin blocks, main page."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        # Remember when the build happened so _compressFiles() only
        # re-compresses files regenerated by this run.
        build_time = time.mktime(time.localtime())
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)
603
604 def _createEmptyStats(self):
605 stats = {}
606 stats['viewed_bandwidth'] = 0
607 stats['not_viewed_bandwidth'] = 0
608 stats['viewed_pages'] = 0
609 stats['viewed_hits'] = 0
610 stats['nb_visits'] = 0
611
612 return stats
613
614 def _generateMonthStats(self):
615 self._clearDisplay()
616
617 visits = self.current_analysis['visits']
618
619 stats = self._createEmptyStats()
620 for (day, stat) in self.current_analysis['days_stats'].items():
621 for k in stats.keys():
622 stats[k] += stat[k]
623
624 duplicated_stats = {k:v for (k,v) in stats.items()}
625
626 cur_time = self.meta_infos['last_time']
627 self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
628 self.logger.info(stats)
629
630 if not 'month_stats' in self.current_analysis.keys():
631 self.current_analysis['month_stats'] = stats
632 else:
633 for (k,v) in stats.items():
634 self.current_analysis['month_stats'][k] = v
635
636 self.valid_visitors = {}
637 for (k,v) in visits.items():
638 if self.isValidVisitor(v):
639 self.valid_visitors[k] = v
640
641 duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
642
643 self._callPlugins(conf.POST_HOOK_DIRECTORY)
644
645 if args.display_only:
646 self._generateDisplay()
647 return
648
649 path = self.getDBFilename(cur_time)
650
651 self.logger.info("==> Serialize to %s" % (path))
652 self._serialize(self.current_analysis, path)
653
654 # Save month stats
655 year = cur_time.tm_year
656 month = cur_time.tm_mon
657 if not 'stats' in self.meta_infos.keys():
658 self.meta_infos['stats'] = {}
659 if not year in self.meta_infos['stats'].keys():
660 self.meta_infos['stats'][year] = {}
661 self.meta_infos['stats'][year][month] = duplicated_stats
662
663 self.logger.info("==> Serialize to %s" % (conf.META_PATH))
664 self._serialize(self.meta_infos, conf.META_PATH)
665
666 self._generateDisplay()
667
668 def _generateDayStats(self):
669 if args.display_only:
670 return
671
672 visits = self.current_analysis['visits']
673 cur_time = self.meta_infos['last_time']
674
675 self._callPlugins(conf.PRE_HOOK_DIRECTORY)
676
677 stats = self._createEmptyStats()
678
679 day = cur_time.tm_mday
680 for (k, super_hit) in visits.items():
681 if super_hit['last_access'].tm_mday != day:
682 continue
683 if super_hit['robot']:
684 stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
685 continue
686 stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
687 stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
688 stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
689 if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
690 super_hit['viewed_pages'].get(day, 0)):
691 stats['nb_visits'] += 1
692
693 self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
694 self.logger.info(stats)
695
696 self.current_analysis['days_stats'][cur_time.tm_mday] = stats
697
    def _newHit(self, hit):
        """Integrate one parsed log line *hit* into the analysis.

        Handles month/day rollovers (flushing stats and switching the
        on-disk database), filters out foreign domains, already-analysed
        or past hits and non-GET/POST requests.  Returns True when the
        hit was counted.
        """
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s" % (hit))
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            # Very first hit: load (or create) the DB of its month.
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            # Skip lines already processed in a previous run.
            if not self.analyse_started and\
               time.mktime(t) <= time.mktime(cur_time):
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            if t < cur_time: # Don't accept past hits
                return False
            if cur_time.tm_mon != t.tm_mon:
                # Month rollover: flush current stats, load the next DB.
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                # Day rollover within the same month.
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        # Normalize the log's empty-value placeholders to empty strings.
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True
742
743 def _reset(self):
744 reset_time = time.strptime(self.args.reset, '%m/%Y')
745
746 self.logger.info('Reset time')
747 self.logger.info(reset_time)
748
749 self.meta_infos['last_time'] = reset_time
750
751 cur_time = time.localtime()
752 year = reset_time.tm_year
753 while year < cur_time.tm_year:
754 db_path = os.path.join(conf.DB_ROOT, str(year))
755 if os.path.exists(db_path): shutil.rmtree(db_path)
756 output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
757 if os.path.exists(output_path): shutil.rmtree(output_path)
758 year += 1
759 month = reset_time.tm_mon
760 while month <= cur_time.tm_mon:
761 db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
762 if os.path.exists(db_path): shutil.rmtree(db_path)
763 output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
764 if os.path.exists(output_path): shutil.rmtree(output_path)
765 month += 1
766
    def start(self, _file, args):
        """Run the analysis over *_file*, an iterable of log lines.

        Loads the meta and current-month databases, optionally resets
        them (--reset), feeds every line matching the log format to
        _newHit() and finally flushes pending day/month statistics.
        """
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict(""))
            else:
                self.logger.warning("No match for %s" % (l))
                #break

        # Flush the pending day/month once the whole input is consumed.
        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')
806
807
class FileIter(object):
    """Iterator over the lines of several (possibly gzipped) log files.

    *filenames* is a comma-separated list of paths; files are read in
    order and lines are yielded without their trailing newline.
    """

    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        # Fail fast when any of the requested files is missing.
        for f in self.filenames:
            if not os.path.exists(f):
                print 'No such file \'%s\'' % (f)
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol entry point; delegates to next().
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one (gzip-aware).

        Raises StopIteration when no file is left.
        """
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        """Return the next log line, switching files on end of file."""
        l = self.cur_file.readline()
        if not l:
            self._openNextFile()
            l = self.cur_file.readline()
        # Strip the trailing newline.
        return l[:-1]
842
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
                        default=False,
                        help='Only generate display')

    parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
                        default=False,
                        help='Process log but don\'t write files (database and HTML) to disk')

    args = parser.parse_args()

    # Load user conf
    # Merge user conf over default conf: keys ending in '_append' extend
    # the corresponding default list instead of replacing it.
    for (k,v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if type(conf.__dict__[new_k]) == list:
                    if type(v) == list:
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

    # -c wipes both the databases and the generated HTML (unless dry-run).
    if args.clean_output and not args.dry_run:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel, args.dry_run)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    # Input source: stdin, the -f file list, or conf.analyzed_filename.
    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags