iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
22import os
23import shutil
24import sys
25import re
26import time
27import pickle
28import gzip
29import importlib
30import argparse
31import logging
32import gettext
33from calendar import monthrange
34from datetime import date, datetime
35
36import default_conf as conf
37import conf as user_conf
38
39from iplugin import *
40from display import *
41
42"""
43Main class IWLA
44Parse logs, compute statistics, call plugins and produce the output
45For now, only HTTP logs are valid
46
47Plugin requirements :
48 None
49
50Conf values needed :
51 analyzed_filename
52 domain_name
53 locales_path
54 compress_output_files
55 excluded_ip
56
57Output files :
58 DB_ROOT/meta.db
59 DB_ROOT/year/month/iwla.db
60 OUTPUT_ROOT/index.html
61 OUTPUT_ROOT/year/_stats.html
62 OUTPUT_ROOT/year/month/index.html
63
64Statistics creation :
65
66meta :
67 last_time
68 start_analysis_time
69 stats =>
70 year =>
71 month =>
72 viewed_bandwidth
73 not_viewed_bandwidth
74 viewed_pages
75 viewed_hits
76 nb_visits
77 nb_visitors
78
79month_stats :
80 viewed_bandwidth
81 not_viewed_bandwidth
82 viewed_pages
83 viewed_hits
84 nb_visits
85
86days_stats :
87 day =>
88 viewed_bandwidth
89 not_viewed_bandwidth
90 viewed_pages
91 viewed_hits
92 nb_visits
93 nb_visitors
94
95visits :
96 remote_addr =>
97 remote_addr
98 remote_ip
99 viewed_pages{0..31} # 0 contains total
100 viewed_hits{0..31} # 0 contains total
101 not_viewed_pages{0..31}
102 not_viewed_hits{0..31}
103 bandwidth{0..31}
104 last_access
105 requests =>
106 [fields_from_format_log]
107 extract_request =>
108 http_method
109 http_uri
110 http_version
111 extract_uri
112 extract_parameters*
113 extract_referer* =>
114 extract_uri
115 extract_parameters*
116 robot
117 hit_only
118 is_page
119
120valid_visitors:
121 month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
122
123Statistics update :
124 None
125
126Statistics deletion :
127 None
128"""
129
130
class IWLA(object):
    """Main analyzer: parses HTTP logs, computes statistics, runs plugins
    and produces the HTML output."""

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.6'

    def __init__(self, logLevel, dry_run):
        """Prepare parsing regexps, plugin lists, logging and locale."""
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None
        self.dry_run = dry_run

        # Turn conf.log_format into a regexp: escape every character that
        # is neither a word character nor '$'/'?', then map each $field
        # to a named capture group.
        escaped_format = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', escaped_format)
        self.log_re = re.compile(self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.final_slashes_re = re.compile(r'/+$')
        self.excluded_ip = [re.compile(ip) for ip in conf.excluded_ip]
        self.plugins = [(conf.PRE_HOOK_DIRECTORY, conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY, conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY, conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info('==> Start (DRY RUN)' if self.dry_run else '==> Start')
        try:
            translation = gettext.translation('iwla', localedir=conf.locales_path,
                                              languages=[conf.locale])
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            translation = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = translation.gettext
174
175 def getVersion(self):
176 return IWLA.IWLA_VERSION
177
178 def getConfValue(self, key, default=None):
179 if not key in dir(conf):
180 return default
181 else:
182 return conf.__dict__[key]
183
184 def _clearVisits(self):
185 self.current_analysis = {
186 'days_stats' : {},
187 'month_stats' : {},
188 'visits' : {}
189 }
190 self.valid_visitors = None
191 return self.current_analysis
192
193 def getDaysStats(self):
194 return self.current_analysis['days_stats']
195
196 def getMonthStats(self):
197 return self.current_analysis['month_stats']
198
199 def getCurrentVisits(self):
200 return self.current_analysis['visits']
201
202 def getValidVisitors(self):
203 return self.valid_visitors
204
205 def getDisplay(self):
206 return self.display
207
208 def getCurTime(self):
209 return self.meta_infos['last_time']
210
211 def getStartAnalysisTime(self):
212 return self.meta_infos['start_analysis_time']
213
214 def isValidForCurrentAnalysis(self, request):
215 cur_time = self.meta_infos['start_analysis_time']
216 # Analyse not started
217 if not cur_time: return False
218 return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
219
220 def hasBeenViewed(self, request):
221 return int(request['status']) in conf.viewed_http_codes
222
223 def getCurDisplayPath(self, filename):
224 cur_time = self.meta_infos['last_time']
225 return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
226
227 def getResourcesPath(self):
228 return conf.resources_path
229
230 def getCSSPath(self):
231 return conf.css_path
232
233 def _clearMeta(self):
234 self.meta_infos = {
235 'last_time' : None,
236 'start_analysis_time' : None
237 }
238 return self.meta_infos
239
240 def _clearDisplay(self):
241 self.display.clear()
242 return self.display
243
244 def getDBFilename(self, time):
245 return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
246
247 def _openDB(self, filename, prot='r'):
248 if self.args.dont_compress:
249 return open(filename, prot)
250 else:
251 return gzip.open(filename, prot)
252
253 def _serialize(self, obj, filename):
254 if self.dry_run: return
255 base = os.path.dirname(filename)
256 if not os.path.exists(base):
257 os.makedirs(base)
258
259 # Make a backup in case of something fails
260 if os.path.exists(filename):
261 shutil.copy(filename, filename + '.bak')
262
263 with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
264 pickle.dump(obj, f)
265 f.seek(0)
266 fzip.write(f.read())
267 os.fsync(fzip)
268 os.remove(filename + '.tmp')
269 if os.path.exists(filename + '.bak'):
270 os.remove(filename + '.bak')
271
272 def _deserialize(self, filename):
273 if not os.path.exists(filename):
274 return None
275
276 res = None
277 with self._openDB(filename) as f:
278 res = pickle.load(f)
279 return res
280
281 def _callPlugins(self, target_root, *args):
282 self.logger.info('==> Call plugins (%s)' % (target_root))
283 for (root, plugins) in self.plugins:
284 if root != target_root: continue
285 for p in plugins:
286 mod = self.cache_plugins.get(root + '.' + p, None)
287 if mod:
288 self.logger.info('\t%s' % (p))
289 mod.hook(*args)
290
291 def isPage(self, request):
292 self.logger.debug("Is page %s" % (request))
293 for e in conf.pages_extensions:
294 if request.endswith(e):
295 self.logger.debug("True")
296 return True
297 self.logger.debug("False")
298 return False
299
300 def isMultimediaFile(self, request):
301 self.logger.debug("Is multimedia %s" % (request))
302 for e in conf.multimedia_files:
303 if request.endswith(e):
304 self.logger.debug("True")
305 return True
306 self.logger.debug("False")
307 return False
308
309 def isValidVisitor(self, hit):
310 if hit['robot']: return False
311 if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
312 return False
313 return True
314
315 def isRobot(self, hit):
316 return hit['robot']
317
318 def _appendHit(self, hit):
319 remote_addr = hit['remote_addr']
320
321 if not remote_addr: return
322
323 for ip in self.excluded_ip:
324 if ip.match(remote_addr):
325 return
326
327 if not remote_addr in self.current_analysis['visits'].keys():
328 self._createVisitor(hit)
329
330 super_hit = self.current_analysis['visits'][remote_addr]
331 # Don't keep all requests for robots
332 if not super_hit['robot']:
333 super_hit['requests'].append(hit)
334
335 day = self.meta_infos['last_time'].tm_mday
336 if self.hasBeenViewed(hit):
337 super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
338 super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
339 super_hit['last_access'] = self.meta_infos['last_time']
340
341 request = hit['extract_request']
342
343 uri = request.get('extract_uri', request['http_uri'])
344
345 hit['is_page'] = self.isPage(uri)
346
347 if super_hit['robot'] or\
348 not self.hasBeenViewed(hit):
349 page_key = 'not_viewed_pages'
350 hit_key = 'not_viewed_hits'
351 else:
352 page_key = 'viewed_pages'
353 hit_key = 'viewed_hits'
354
355 if hit['is_page']:
356 super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
357 super_hit[page_key][0] += 1
358 else:
359 super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
360 super_hit[hit_key][0] += 1
361
362 def _createVisitor(self, hit):
363 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
364 super_hit['remote_addr'] = hit['remote_addr']
365 super_hit['remote_ip'] = hit['remote_addr']
366 super_hit['viewed_pages'] = {0:0}
367 super_hit['viewed_hits'] = {0:0}
368 super_hit['not_viewed_pages'] = {0:0}
369 super_hit['not_viewed_hits'] = {0:0}
370 super_hit['bandwidth'] = {0:0}
371 super_hit['last_access'] = self.meta_infos['last_time']
372 super_hit['requests'] = []
373 super_hit['robot'] = False
374 super_hit['hit_only'] = 0
375
376 def _normalizeURI(self, uri):
377 if uri == '/': return uri
378 uri = self.final_slashes_re.sub('/', uri)
379 return uri
380
381 def _removeFinalSlashes(self, uri):
382 if uri == '/': return uri
383 return self.final_slashes_re.sub('', uri)
384
385 def _normalizeParameters(self, parameters):
386 # No parameters
387 if parameters == '?': return None
388 return parameters
389
390 def _decodeHTTPRequest(self, hit):
391 if not 'request' in hit.keys(): return False
392
393 groups = self.http_request_extracted.match(hit['request'])
394
395 if groups:
396 hit['extract_request'] = groups.groupdict("")
397 uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
398 if uri_groups:
399 d = uri_groups.groupdict("")
400 hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
401 if 'extract_parameters' in d.keys():
402 parameters = self._normalizeParameters(d['extract_parameters'])
403 if parameters:
404 hit['extract_request']['extract_parameters'] = parameters
405 else:
406 self.logger.warning("Bad request extraction %s" % (hit['request']))
407 return False
408
409 if hit['http_referer']:
410 referer_groups = self.uri_re.match(hit['http_referer'])
411 if referer_groups:
412 hit['extract_referer'] = referer_groups.groupdict("")
413 hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
414 hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
415 return True
416
417 def _decodeTime(self, hit):
418 try:
419 hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
420 except ValueError as e:
421 if sys.version_info < (3, 2):
422 # Try without UTC value at the end (%z not recognized)
423 gmt_offset_str = hit['time_local'][-5:]
424 gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
425 gmt_offset_minutes = int(gmt_offset_str[3:5])*60
426 gmt_offset = gmt_offset_hours + gmt_offset_minutes
427 hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
428 # if gmt_offset_str[0] == '-':
429 # hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
430 # else:
431 # hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
432 else:
433 raise e
434 return hit['time_decoded']
435
436 def getDisplayIndex(self):
437 cur_time = self.meta_infos['last_time']
438 filename = self.getCurDisplayPath('index.html')
439
440 return self.display.getPage(filename)
441
442 def _generateDisplayDaysStats(self):
443 cur_time = self.meta_infos['last_time']
444 title = createCurTitle(self, self._('Statistics'))
445 filename = self.getCurDisplayPath('index.html')
446 self.logger.info('==> Generate display (%s)' % (filename))
447 page = self.display.createPage(title, filename, conf.css_path)
448 link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
449 page.appendBlock(link)
450
451 _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
452 days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6), [4, 5])
453 days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
454 nb_visits = 0
455 nb_days = 0
456 for i in range(1, nb_month_days+1):
457 day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
458 full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
459 if i in self.current_analysis['days_stats'].keys():
460 stats = self.current_analysis['days_stats'][i]
461 row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
462 stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
463 nb_visits += stats['nb_visits']
464 nb_days += 1
465 else:
466 row = [full_day, 0, 0, 0, 0, 0]
467 days.appendRow(row)
468 viewed_bandwidth = row[4]
469 not_viewed_bandwidth = row[5]
470 days.setCellValue(i-1, 4, viewed_bandwidth)
471 days.setCellValue(i-1, 5, not_viewed_bandwidth)
472 days.appendShortTitle(day)
473 adate = date(cur_time.tm_year, cur_time.tm_mon, i)
474 week_day = adate.weekday()
475 if week_day == 5 or week_day == 6:
476 days.setRowCSSClass(i-1, 'iwla_weekend')
477 if adate == date.today():
478 css = days.getCellCSSClass(i-1, 0)
479 if css: css = '%s %s' % (css, 'iwla_curday')
480 else: css = 'iwla_curday'
481 days.setCellCSSClass(i-1, 0, css)
482
483 stats = self.current_analysis['month_stats']
484
485 row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
486 if nb_days:
487 average_row = list(map(lambda v: int(v/nb_days), row))
488 else:
489 average_row = list(map(lambda v: 0, row))
490
491 average_row[0] = self._('Average')
492 days.appendRow(average_row)
493
494 row[0] = self._('Total')
495 days.appendRow(row)
496 page.appendBlock(days)
497 self.display.addPage(page)
498
499 def _generateDisplayMonthStats(self, page, year, month_stats):
500 cur_time = time.localtime()
501 months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
502 title = '%s %d' % (self._('Summary'), year)
503 cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
504 graph_cols=range(1,7)
505 months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols, [5, 6])
506 months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
507 months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1], [5, 6])
508 months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
509 total = [0] * len(cols)
510 for i in range(1, 13):
511 month = '%s<br/>%d' % (months_name[i], year)
512 full_month = '%s %d' % (months_name[i], year)
513 if i in month_stats.keys():
514 stats = month_stats[i]
515 link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
516 row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
517 stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
518 for j in graph_cols:
519 total[j] += row[j]
520 else:
521 row = [full_month, 0, 0, 0, 0, 0, 0, '']
522 months.appendRow(row)
523 viewed_bandwidth = row[5]
524 not_viewed_bandwidth = row[6]
525 months.setCellValue(i-1, 5, viewed_bandwidth)
526 months.setCellValue(i-1, 6, not_viewed_bandwidth)
527 months.appendShortTitle(month)
528 months_.appendRow(row[:-1])
529 months_.setCellValue(i-1, 5, viewed_bandwidth)
530 months_.setCellValue(i-1, 6, not_viewed_bandwidth)
531 months_.appendShortTitle(month)
532 if year == cur_time.tm_year and i == cur_time.tm_mon:
533 css = months.getCellCSSClass(i-1, 0)
534 if css: css = '%s %s' % (css, 'iwla_curday')
535 else: css = 'iwla_curday'
536 months.setCellCSSClass(i-1, 0, css)
537 months_.setCellCSSClass(i-1, 0, css)
538
539 total[0] = self._('Total')
540 total[7] = u''
541 months.appendRow(total)
542 page.appendBlock(months)
543
544 months_.appendRow(total[:-1])
545 filename = '%d/_stats.html' % (year)
546 page_ = self.display.createPage(u'', filename, conf.css_path)
547 page_.appendBlock(months_)
548 page_.build(conf.DISPLAY_ROOT, False)
549
550 def _generateDisplayWholeMonthStats(self):
551 title = '%s %s' % (self._('Statistics for'), conf.domain_name)
552 filename = 'index.html'
553
554 self.logger.info('==> Generate main page (%s)' % (filename))
555
556 page = self.display.createPage(title, filename, conf.css_path)
557
558 last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
559 page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
560 duration = datetime.now() - self.start_time
561 duration = time.gmtime(duration.seconds)
562 time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
563 if duration.tm_hour:
564 time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
565 time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
566 page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))
567
568 for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
569 self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])
570
571 self.display.addPage(page)
572
573 def _compressFile(self, root, filename):
574 path = os.path.join(root, filename)
575 gz_path = path + '.gz'
576
577 self.logger.debug('Compress %s => %s' % (path, gz_path))
578
579 if not os.path.exists(gz_path) or\
580 os.stat(path).st_mtime > os.stat(gz_path).st_mtime:
581 if self.dry_run: return
582 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
583 f_out.write(f_in.read())
584
585 def _compressFiles(self, root):
586 if not conf.compress_output_files: return
587 for rootdir, subdirs, files in os.walk(root, followlinks=True):
588 for f in files:
589 for ext in conf.compress_output_files:
590 if f.endswith(ext):
591 self._compressFile(rootdir, f)
592 break
593
594 def _generateDisplay(self):
595 self._generateDisplayDaysStats()
596 self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
597 self._generateDisplayWholeMonthStats()
598 self.display.build(conf.DISPLAY_ROOT)
599 self._compressFiles(conf.DISPLAY_ROOT)
600
601 def _createEmptyStats(self):
602 stats = {}
603 stats['viewed_bandwidth'] = 0
604 stats['not_viewed_bandwidth'] = 0
605 stats['viewed_pages'] = 0
606 stats['viewed_hits'] = 0
607 stats['nb_visits'] = 0
608
609 return stats
610
611 def _generateMonthStats(self):
612 self._clearDisplay()
613
614 visits = self.current_analysis['visits']
615
616 stats = self._createEmptyStats()
617 for (day, stat) in self.current_analysis['days_stats'].items():
618 for k in stats.keys():
619 stats[k] += stat[k]
620
621 duplicated_stats = {k:v for (k,v) in stats.items()}
622
623 cur_time = self.meta_infos['last_time']
624 self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
625 self.logger.info(stats)
626
627 if not 'month_stats' in self.current_analysis.keys():
628 self.current_analysis['month_stats'] = stats
629 else:
630 for (k,v) in stats.items():
631 self.current_analysis['month_stats'][k] = v
632
633 self.valid_visitors = {}
634 for (k,v) in visits.items():
635 if self.isValidVisitor(v):
636 self.valid_visitors[k] = v
637
638 duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
639
640 self._callPlugins(conf.POST_HOOK_DIRECTORY)
641
642 if args.display_only:
643 if not 'stats' in self.meta_infos.keys():
644 self.meta_infos['stats'] = {}
645 self._generateDisplay()
646 return
647
648 path = self.getDBFilename(cur_time)
649
650 self.logger.info("==> Serialize to %s" % (path))
651 self._serialize(self.current_analysis, path)
652
653 # Save month stats
654 year = cur_time.tm_year
655 month = cur_time.tm_mon
656 if not 'stats' in self.meta_infos.keys():
657 self.meta_infos['stats'] = {}
658 if not year in self.meta_infos['stats'].keys():
659 self.meta_infos['stats'][year] = {}
660 self.meta_infos['stats'][year][month] = duplicated_stats
661
662 self.logger.info("==> Serialize to %s" % (conf.META_PATH))
663 self._serialize(self.meta_infos, conf.META_PATH)
664
665 self._generateDisplay()
666
667 def _generateDayStats(self):
668 if args.display_only:
669 return
670
671 visits = self.current_analysis['visits']
672 cur_time = self.meta_infos['last_time']
673
674 self._callPlugins(conf.PRE_HOOK_DIRECTORY)
675
676 stats = self._createEmptyStats()
677
678 day = cur_time.tm_mday
679 for (k, super_hit) in visits.items():
680 if super_hit['last_access'].tm_mday != day:
681 continue
682 if super_hit['robot']:
683 stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
684 continue
685 stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
686 stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
687 stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
688 if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
689 super_hit['viewed_pages'].get(day, 0)):
690 stats['nb_visits'] += 1
691
692 self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
693 self.logger.info(stats)
694
695 self.current_analysis['days_stats'][cur_time.tm_mday] = stats
696
697 def _newHit(self, hit):
698 if not self.domain_name_re.match(hit['server_name']):
699 self.logger.debug("Not in domain %s" % (hit))
700 return False
701
702 t = self._decodeTime(hit)
703
704 cur_time = self.meta_infos['last_time']
705
706 if cur_time == None:
707 self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
708 self.analyse_started = True
709 else:
710 if not self.analyse_started and\
711 time.mktime(t) <= time.mktime(cur_time):
712 self.logger.debug("Not in time")
713 return False
714 self.analyse_started = True
715 if t < cur_time: # Don't accept past hits
716 return False
717 if cur_time.tm_mon != t.tm_mon:
718 self._generateDayStats()
719 self._generateMonthStats()
720 self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
721 elif cur_time.tm_mday != t.tm_mday:
722 self._generateDayStats()
723
724 self.meta_infos['last_time'] = t
725
726 if not self.meta_infos['start_analysis_time']:
727 self.meta_infos['start_analysis_time'] = t
728
729 if not self._decodeHTTPRequest(hit): return False
730
731 if hit['extract_request']['http_method'] not in ['GET', 'POST']:
732 return False
733
734 for k in hit.keys():
735 if hit[k] == '-' or hit[k] == '*':
736 hit[k] = ''
737
738 self._appendHit(hit)
739
740 return True
741
742 def _reset(self):
743 reset_time = time.strptime(self.args.reset, '%m/%Y')
744
745 self.logger.info('Reset time')
746 self.logger.info(reset_time)
747
748 self.meta_infos['last_time'] = reset_time
749
750 cur_time = time.localtime()
751 year = reset_time.tm_year
752 while year < cur_time.tm_year:
753 db_path = os.path.join(conf.DB_ROOT, str(year))
754 if os.path.exists(db_path): shutil.rmtree(db_path)
755 output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
756 if os.path.exists(output_path): shutil.rmtree(output_path)
757 year += 1
758 month = reset_time.tm_mon
759 while month <= cur_time.tm_mon:
760 db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
761 if os.path.exists(db_path): shutil.rmtree(db_path)
762 output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
763 if os.path.exists(output_path): shutil.rmtree(output_path)
764 month += 1
765
766 def start(self, _file, args):
767 self.args = args
768 self.start_time = datetime.now()
769
770 self.logger.info('==> Load previous database')
771
772 self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
773 if self.meta_infos['last_time']:
774 if args.reset:
775 self._reset()
776 self.logger.info('Last time')
777 self.logger.info(self.meta_infos['last_time'])
778 self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
779 else:
780 self._clearVisits()
781
782 self.meta_infos['start_analysis_time'] = None
783
784 self.cache_plugins = preloadPlugins(self.plugins, self)
785
786 self.logger.info('==> Analysing log')
787
788 for l in _file:
789 # print "line " + l
790
791 groups = self.log_re.match(l)
792
793 if groups:
794 self._newHit(groups.groupdict(""))
795 else:
796 self.logger.warning("No match for %s" % (l))
797 #break
798
799 if self.analyse_started:
800 self._generateDayStats()
801 self._generateMonthStats()
802 del self.meta_infos['start_analysis_time']
803 else:
804 self.logger.info('==> Analyse not started : nothing new')
805
806
class FileIter(object):
    """Iterate over the lines (trailing newline stripped) of a comma
    separated list of log files; '.gz' files are transparently
    decompressed.

    Fixes over the original:
      * gzip files are opened in text mode ('rt') — under Python 3
        binary mode yielded bytes that crashed the str regexps;
      * the final line of a file lacking a trailing newline is no longer
        truncated by one character (l[:-1] -> rstrip('\\n'));
      * an empty file in the list no longer yields a spurious '' line.
    """

    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print('No such file \'%s\'' % (f))
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one.

        Raises StopIteration when the file list is exhausted.
        """
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            # 'rt': decompress to text so lines match the str regexps
            self.cur_file = gzip.open(filename, 'rt')
        else:
            self.cur_file = open(filename)

    def next(self):
        """Return the next line, transparently switching files (and
        skipping empty ones)."""
        l = self.cur_file.readline()
        while not l:
            self._openNextFile()
            l = self.cur_file.readline()
        # Strip only the trailing newline (the last line may not have one)
        return l.rstrip('\n')
841
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')
    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')
    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')
    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')
    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')
    parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
                        default=False,
                        help='Only generate display')
    parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
                        default=False,
                        help='Process log but don\'t write files (database and HTML) to disk')

    args = parser.parse_args()

    # Merge user configuration into the defaults: plain keys override the
    # default value, '*_append' keys extend the matching default list
    for (key, value) in user_conf.__dict__.items():
        if not key.endswith('_append'):
            conf.__dict__.update({key: value})
            continue
        target = key[:-len('_append')]
        if target not in dir(conf):
            print("Error %s doesn't exists in default conf" % (target))
        elif type(conf.__dict__[target]) != list:
            print("Error %s is not a list" % (target))
        elif type(value) == list:
            conf.__dict__[target] += value
        else:
            conf.__dict__[target].append(value)

    if args.clean_output and not args.dry_run:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel, args.dry_run)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags