iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
22import os
23import shutil
24import sys
25import re
26import time
27import pickle
28import gzip
29import importlib
30import argparse
31import logging
32import gettext
33from calendar import monthrange
34from datetime import date, datetime
35
36import default_conf as conf
37import conf as user_conf
38
39from iplugin import *
40from display import *
41
42"""
43Main class IWLA
44Parse Log, compute them, call plugins and produce output
45For now, only HTTP log are valid
46
47Plugin requirements :
48 None
49
50Conf values needed :
51 analyzed_filename
52 domain_name
53 locales_path
54 compress_output_files
55 excluded_ip
56
57Output files :
58 DB_ROOT/meta.db
59 DB_ROOT/year/month/iwla.db
60 OUTPUT_ROOT/index.html
61 OUTPUT_ROOT/year/_stats.html
62 OUTPUT_ROOT/year/month/index.html
63
64Statistics creation :
65
66meta :
67 last_time
68 start_analysis_time
69 stats =>
70 year =>
71 month =>
72 viewed_bandwidth
73 not_viewed_bandwidth
74 viewed_pages
75 viewed_hits
76 nb_visits
77 nb_visitors
78
79month_stats :
80 viewed_bandwidth
81 not_viewed_bandwidth
82 viewed_pages
83 viewed_hits
84 nb_visits
85
86days_stats :
87 day =>
88 viewed_bandwidth
89 not_viewed_bandwidth
90 viewed_pages
91 viewed_hits
92 nb_visits
93 nb_visitors
94
95visits :
96 remote_addr =>
97 remote_addr
98 remote_ip
99 viewed_pages{0..31} # 0 contains total
100 viewed_hits{0..31} # 0 contains total
101 not_viewed_pages{0..31}
102 not_viewed_hits{0..31}
103 bandwidth{0..31}
104 last_access
105 requests =>
106 [fields_from_format_log]
107 extract_request =>
108 http_method
109 http_uri
110 http_version
111 extract_uri
112 extract_parameters*
113 extract_referer* =>
114 extract_uri
115 extract_parameters*
116 robot
117 hit_only
118 is_page
119
120valid_visitors:
121 month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
122
123Statistics update :
124 None
125
126Statistics deletion :
127 None
128"""
129
130
class IWLA(object):
    """Main analyzer: parses HTTP logs, computes statistics, calls plugins
    and produces HTML output (see the module docstring for the exact layout
    of the databases and generated files)."""

    # Kind of log analyzed; plugins may check this against their requirements
    ANALYSIS_CLASS = 'HTTP'
    # Plugin API compatibility version
    API_VERSION = 1
    # Software version string shown in generated pages
    IWLA_VERSION = '0.6'
136
137 def __init__(self, logLevel, dry_run):
138 self.meta_infos = {}
139 self.analyse_started = False
140 self.current_analysis = {}
141 self.start_time = 0
142 self.cache_plugins = {}
143 self.display = DisplayHTMLBuild(self)
144 self.valid_visitors = None
145 self.dry_run = dry_run
146
147 self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
148 self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
149 self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
150 self.log_re = re.compile(self.log_format_extracted)
151 self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
152 self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
153 self.final_slashes_re = re.compile(r'/+$')
154 self.excluded_ip = []
155 for ip in conf.excluded_ip:
156 self.excluded_ip += [re.compile(ip)]
157 self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
158 (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
159 (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
160
161 logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
162 self.logger = logging.getLogger(self.__class__.__name__)
163 if self.dry_run:
164 self.logger.info('==> Start (DRY RUN)')
165 else:
166 self.logger.info('==> Start')
167 try:
168 t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale])
169 self.logger.info('\tUsing locale %s' % (conf.locale))
170 except IOError:
171 t = gettext.NullTranslations()
172 self.logger.info('\tUsing default locale en_EN')
173 self._ = t.gettext
174
175 def getVersion(self):
176 return IWLA.IWLA_VERSION
177
178 def getConfValue(self, key, default=None):
179 if not key in dir(conf):
180 return default
181 else:
182 return conf.__dict__[key]
183
184 def _clearVisits(self):
185 self.current_analysis = {
186 'days_stats' : {},
187 'month_stats' : {},
188 'visits' : {}
189 }
190 self.valid_visitors = None
191 return self.current_analysis
192
    def getDaysStats(self):
        """Return per-day statistics of the month being analyzed."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return aggregated statistics of the month being analyzed."""
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        """Return the visitors dict (remote_addr => visitor info)."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return visitors filtered by isValidVisitor() (None until month
        stats have been generated)."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the DisplayHTMLBuild instance used for HTML generation."""
        return self.display

    def getCurTime(self):
        """Return the timestamp (struct_time) of the last analyzed hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the timestamp at which the current analysis run started."""
        return self.meta_infos['start_analysis_time']
213
214 def isValidForCurrentAnalysis(self, request):
215 cur_time = self.meta_infos['start_analysis_time']
216 # Analyse not started
217 if not cur_time: return False
218 return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
219
220 def hasBeenViewed(self, request):
221 return int(request['status']) in conf.viewed_http_codes
222
223 def getCurDisplayPath(self, filename):
224 cur_time = self.meta_infos['last_time']
225 return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
226
    def getResourcesPath(self):
        """Return conf.resources_path (static resources directory)."""
        return conf.resources_path

    def getCSSPath(self):
        """Return conf.css_path (CSS files used by generated pages)."""
        return conf.css_path
232
233 def _clearMeta(self):
234 self.meta_infos = {
235 'last_time' : None,
236 'start_analysis_time' : None
237 }
238 return self.meta_infos
239
240 def _clearDisplay(self):
241 self.display.clear()
242 return self.display
243
244 def getDBFilename(self, time):
245 return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
246
247 def _openDB(self, filename, prot='r'):
248 if self.args.dont_compress:
249 return open(filename, prot)
250 else:
251 return gzip.open(filename, prot)
252
    def _serialize(self, obj, filename):
        """Pickle *obj* into *filename* (gzipped unless --dont-compress).

        A '.bak' copy of any previous database is kept while writing, and
        the pickle goes through a '.tmp' file first, so a crash mid-write
        cannot corrupt an existing database.  No-op in dry-run mode.
        """
        if self.dry_run: return
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # Make a backup in case of something fails
        if os.path.exists(filename):
            shutil.copy(filename, filename + '.bak')

        # Dump into the temporary file, then copy its whole content into the
        # (possibly compressed) destination and force it to disk
        with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
            pickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
            os.fsync(fzip)
        os.remove(filename + '.tmp')
        if os.path.exists(filename + '.bak'):
            os.remove(filename + '.bak')
271
272 def _deserialize(self, filename):
273 if not os.path.exists(filename):
274 return None
275
276 res = None
277 with self._openDB(filename) as f:
278 res = pickle.load(f)
279 return res
280
281 def _callPlugins(self, target_root, *args):
282 self.logger.info('==> Call plugins (%s)' % (target_root))
283 for (root, plugins) in self.plugins:
284 if root != target_root: continue
285 for p in plugins:
286 mod = self.cache_plugins.get(root + '.' + p, None)
287 if mod:
288 self.logger.info('\t%s' % (p))
289 mod.hook(*args)
290
291 def isPage(self, request):
292 self.logger.debug("Is page %s" % (request))
293 for e in conf.pages_extensions:
294 if request.endswith(e):
295 self.logger.debug("True")
296 return True
297 self.logger.debug("False")
298 return False
299
300 def isMultimediaFile(self, request):
301 self.logger.debug("Is multimedia %s" % (request))
302 for e in conf.multimedia_files:
303 if request.endswith(e):
304 self.logger.debug("True")
305 return True
306 self.logger.debug("False")
307 return False
308
309 def isValidVisitor(self, hit):
310 if hit['robot']: return False
311 if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
312 return False
313 return True
314
315 def isRobot(self, hit):
316 return hit['robot']
317
    def _appendHit(self, hit):
        """Account one parsed log line into the current visitor structures.

        Creates the visitor on first sight, stores the raw request (robots
        excepted) and updates per-day plus monthly-total (index 0) counters
        for bandwidth, pages and hits.
        """
        remote_addr = hit['remote_addr']

        # Hit without address: nothing to account
        if not remote_addr: return

        # Skip administrator-excluded addresses
        for ip in self.excluded_ip:
            if ip.match(remote_addr):
                return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        # Don't keep all requests for robots
        if not super_hit['robot']:
            super_hit['requests'].append(hit)

        day = self.meta_infos['last_time'].tm_mday
        if self.hasBeenViewed(hit):
            # Index 0 holds the month total; per-day slots are lazily created
            super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
            super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        # Fall back to the raw URI when decomposition failed
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        # Robots and non-viewed responses are counted apart
        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
            super_hit[page_key][0] += 1
        else:
            super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
            super_hit[hit_key][0] += 1
361
362 def _createVisitor(self, hit):
363 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
364 super_hit['remote_addr'] = hit['remote_addr']
365 super_hit['remote_ip'] = hit['remote_addr']
366 super_hit['viewed_pages'] = {0:0}
367 super_hit['viewed_hits'] = {0:0}
368 super_hit['not_viewed_pages'] = {0:0}
369 super_hit['not_viewed_hits'] = {0:0}
370 super_hit['bandwidth'] = {0:0}
371 super_hit['last_access'] = self.meta_infos['last_time']
372 super_hit['requests'] = []
373 super_hit['robot'] = False
374 super_hit['hit_only'] = 0
375
376 def _normalizeURI(self, uri):
377 if uri == '/': return uri
378 uri = self.final_slashes_re.sub('/', uri)
379 return uri
380
381 def _removeFinalSlashes(self, uri):
382 if uri == '/': return uri
383 return self.final_slashes_re.sub('', uri)
384
385 def _normalizeParameters(self, parameters):
386 # No parameters
387 if parameters == '?': return None
388 return parameters
389
390 def _decodeHTTPRequest(self, hit):
391 if not 'request' in hit.keys(): return False
392
393 groups = self.http_request_extracted.match(hit['request'])
394
395 if groups:
396 hit['extract_request'] = groups.groupdict("")
397 uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
398 if uri_groups:
399 d = uri_groups.groupdict("")
400 hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
401 if 'extract_parameters' in d.keys():
402 parameters = self._normalizeParameters(d['extract_parameters'])
403 if parameters:
404 hit['extract_request']['extract_parameters'] = parameters
405 else:
406 self.logger.warning("Bad request extraction %s" % (hit['request']))
407 return False
408
409 if hit['http_referer']:
410 referer_groups = self.uri_re.match(hit['http_referer'])
411 if referer_groups:
412 hit['extract_referer'] = referer_groups.groupdict("")
413 hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
414 hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
415 return True
416
    def _decodeTime(self, hit):
        """Parse hit['time_local'] with conf.time_format into a struct_time.

        The result is stored back into hit['time_decoded'] and returned.
        On Python < 3.2 the '%z' directive is unsupported, so the numeric
        UTC offset is stripped before parsing (the offset value itself is
        ignored -- see the commented-out adjustment below).
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError as e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                # if gmt_offset_str[0] == '-':
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                # else:
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']
435
436 def getDisplayIndex(self):
437 cur_time = self.meta_infos['last_time']
438 filename = self.getCurDisplayPath('index.html')
439
440 return self.display.getPage(filename)
441
    def _generateDisplayDaysStats(self):
        """Build the per-month 'index.html' page with its day-by-day table.

        One row per day of the month (zeroes for days without stats), plus
        'Average' and 'Total' rows.  Week-end rows and the current-day cell
        get dedicated CSS classes.
        """
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        # Embed the year summary written by _generateDisplayMonthStats()
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            # Short label for the graph, full label for the table row
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # Bandwidth columns are displayed human readable
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            # weekday() 5/6 == Saturday/Sunday
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        # Summary rows: average over analyzed days, then month total
        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = list(map(lambda v: int(v/nb_days), row))
        else:
            average_row = list(map(lambda v: 0, row))

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)
500
    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append the 12-month summary table for *year* to *page*.

        Also writes a standalone 'year/_stats.html' copy without the
        'Details' link column, which month pages embed in an iframe.
        """
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        # Same table without the 'Details' column, for _stats.html
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            # Short label for the graph, full label for the table row
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            # Bandwidth columns are displayed human readable
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            # Highlight the current month
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        # Standalone copy embedded by month pages
        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        page_.build(conf.DISPLAY_ROOT, False)
551
552 def _generateDisplayWholeMonthStats(self):
553 title = '%s %s' % (self._('Statistics for'), conf.domain_name)
554 filename = 'index.html'
555
556 self.logger.info('==> Generate main page (%s)' % (filename))
557
558 page = self.display.createPage(title, filename, conf.css_path)
559
560 last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
561 page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
562 duration = datetime.now() - self.start_time
563 duration = time.gmtime(duration.seconds)
564 time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
565 if duration.tm_hour:
566 time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
567 time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
568 page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))
569
570 for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
571 self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])
572
573 self.display.addPage(page)
574
575 def _compressFile(self, root, filename):
576 path = os.path.join(root, filename)
577 gz_path = path + '.gz'
578
579 self.logger.debug('Compress %s => %s' % (path, gz_path))
580
581 if not os.path.exists(gz_path) or\
582 os.stat(path).st_mtime > os.stat(gz_path).st_mtime:
583 if self.dry_run: return
584 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
585 f_out.write(f_in.read())
586
587 def _compressFiles(self, root):
588 if not conf.compress_output_files: return
589 for rootdir, subdirs, files in os.walk(root, followlinks=True):
590 for f in files:
591 for ext in conf.compress_output_files:
592 if f.endswith(ext):
593 self._compressFile(rootdir, f)
594 break
595
    def _generateDisplay(self):
        """Produce all HTML output: month page, display plugins, main page,
        then write the files and compress them when configured."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(conf.DISPLAY_ROOT)
602
603 def _createEmptyStats(self):
604 stats = {}
605 stats['viewed_bandwidth'] = 0
606 stats['not_viewed_bandwidth'] = 0
607 stats['viewed_pages'] = 0
608 stats['viewed_hits'] = 0
609 stats['nb_visits'] = 0
610
611 return stats
612
613 def _generateMonthStats(self):
614 self._clearDisplay()
615
616 visits = self.current_analysis['visits']
617
618 stats = self._createEmptyStats()
619 for (day, stat) in self.current_analysis['days_stats'].items():
620 for k in stats.keys():
621 stats[k] += stat[k]
622
623 duplicated_stats = {k:v for (k,v) in stats.items()}
624
625 cur_time = self.meta_infos['last_time']
626 self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
627 self.logger.info(stats)
628
629 if not 'month_stats' in self.current_analysis.keys():
630 self.current_analysis['month_stats'] = stats
631 else:
632 for (k,v) in stats.items():
633 self.current_analysis['month_stats'][k] = v
634
635 self.valid_visitors = {}
636 for (k,v) in visits.items():
637 if self.isValidVisitor(v):
638 self.valid_visitors[k] = v
639
640 duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
641
642 self._callPlugins(conf.POST_HOOK_DIRECTORY)
643
644 if args.display_only:
645 if not 'stats' in self.meta_infos.keys():
646 self.meta_infos['stats'] = {}
647 self._generateDisplay()
648 return
649
650 path = self.getDBFilename(cur_time)
651
652 self.logger.info("==> Serialize to %s" % (path))
653 self._serialize(self.current_analysis, path)
654
655 # Save month stats
656 year = cur_time.tm_year
657 month = cur_time.tm_mon
658 if not 'stats' in self.meta_infos.keys():
659 self.meta_infos['stats'] = {}
660 if not year in self.meta_infos['stats'].keys():
661 self.meta_infos['stats'][year] = {}
662 self.meta_infos['stats'][year][month] = duplicated_stats
663
664 self.logger.info("==> Serialize to %s" % (conf.META_PATH))
665 self._serialize(self.meta_infos, conf.META_PATH)
666
667 self._generateDisplay()
668
669 def _generateDayStats(self):
670 if args.display_only:
671 return
672
673 visits = self.current_analysis['visits']
674 cur_time = self.meta_infos['last_time']
675
676 self._callPlugins(conf.PRE_HOOK_DIRECTORY)
677
678 stats = self._createEmptyStats()
679
680 day = cur_time.tm_mday
681 for (k, super_hit) in visits.items():
682 if super_hit['last_access'].tm_mday != day:
683 continue
684 if super_hit['robot']:
685 stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
686 continue
687 stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
688 stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
689 stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
690 if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
691 super_hit['viewed_pages'].get(day, 0)):
692 stats['nb_visits'] += 1
693
694 self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
695 self.logger.info(stats)
696
697 self.current_analysis['days_stats'][cur_time.tm_mday] = stats
698
    def _newHit(self, hit):
        """Integrate one parsed log line into the analysis.

        Filters foreign domains, already-analyzed timestamps and methods
        other than GET/POST; handles day/month rollover (flushing stats and
        switching the month database).  Returns True when the hit was
        accounted.
        """
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s" % (hit))
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            # First hit ever: load (or create) the database of its month
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started and\
               time.mktime(t) <= time.mktime(cur_time):
                # Already analyzed during a previous run
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            if t < cur_time: # Don't accept past hits
                return False
            if cur_time.tm_mon != t.tm_mon:
                # Month rollover: flush current day/month and switch database
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                # Day rollover
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        # Normalize the 'empty field' markers emitted by the HTTP server
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True
743
744 def _reset(self):
745 reset_time = time.strptime(self.args.reset, '%m/%Y')
746
747 self.logger.info('Reset time')
748 self.logger.info(reset_time)
749
750 self.meta_infos['last_time'] = reset_time
751
752 cur_time = time.localtime()
753 year = reset_time.tm_year
754 while year < cur_time.tm_year:
755 db_path = os.path.join(conf.DB_ROOT, str(year))
756 if os.path.exists(db_path): shutil.rmtree(db_path)
757 output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
758 if os.path.exists(output_path): shutil.rmtree(output_path)
759 year += 1
760 month = reset_time.tm_mon
761 while month <= cur_time.tm_mon:
762 db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
763 if os.path.exists(db_path): shutil.rmtree(db_path)
764 output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
765 if os.path.exists(output_path): shutil.rmtree(output_path)
766 month += 1
767
    def start(self, _file, args):
        """Run the analysis: load previous state, parse every line of
        *_file* (an iterable of log lines), then flush day/month statistics.

        *args* is the parsed command line namespace (kept as self.args).
        """
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            # Resume the month that was being analyzed
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            # print "line " + l

            # Each line must match the regex built from conf.log_format
            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict(""))
            else:
                self.logger.warning("No match for %s" % (l))
                #break

        if self.analyse_started:
            # Flush the (partial) current day and month
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')
807
808
class FileIter(object):
    """Iterate over the lines of a comma-separated list of log files.

    Plain and gzip-compressed ('.gz') files are supported; files are read
    one after the other and each yielded line has its trailing newline
    removed.  Exits the program when a listed file does not exist.
    """
    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print('No such file \'%s\'' % (f))
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one; raises
        StopIteration when no file is left."""
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            # 'rt' (text mode): gzip defaults to binary in Python 3, which
            # would yield bytes and break the str regex matching downstream
            self.cur_file = gzip.open(filename, 'rt')
        else:
            self.cur_file = open(filename)

    def next(self):
        l = self.cur_file.readline()
        # Loop (not a single retry) so empty files in the middle of the
        # list are skipped correctly
        while not l:
            self._openNextFile()
            l = self.cur_file.readline()
        # Strip only the newline: a final line without one must not lose
        # its last character (the previous l[:-1] did)
        return l.rstrip('\n')
843
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
                        default=False,
                        help='Only generate display')

    parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
                        default=False,
                        help='Process log but don\'t write files (database and HTML) to disk')

    args = parser.parse_args()

    # Load user conf: plain keys override default_conf values, while keys
    # ending in '_append' extend the corresponding default list instead
    for (k,v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if type(conf.__dict__[new_k]) == list:
                    if type(v) == list:
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

    # -c: wipe databases and generated HTML (unless dry run)
    if args.clean_output and not args.dry_run:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    # Map the -d argument onto a logging level constant
    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel, args.dry_run)

    # Abort early when the mandatory conf values are missing
    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags