iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
22import os
23import shutil
24import sys
25import re
26import time
27import cPickle
28import gzip
29import importlib
30import argparse
31import logging
32import gettext
33from calendar import monthrange
34from datetime import date, datetime
35
36import default_conf as conf
37import conf as user_conf
38
39from iplugin import *
40from display import *
41
42"""
43Main class IWLA
44Parse Log, compute them, call plugins and produce output
45For now, only HTTP log are valid
46
47Plugin requirements :
48 None
49
50Conf values needed :
51 analyzed_filename
52 domain_name
53 locales_path
54 compress_output_files*
55
56Output files :
57 DB_ROOT/meta.db
58 DB_ROOT/year/month/iwla.db
59 OUTPUT_ROOT/index.html
60 OUTPUT_ROOT/year/_stats.html
61 OUTPUT_ROOT/year/month/index.html
62
63Statistics creation :
64
65meta :
66 last_time
67 start_analysis_time
68 stats =>
69 year =>
70 month =>
71 viewed_bandwidth
72 not_viewed_bandwidth
73 viewed_pages
74 viewed_hits
75 nb_visits
76 nb_visitors
77
78month_stats :
79 viewed_bandwidth
80 not_viewed_bandwidth
81 viewed_pages
82 viewed_hits
83 nb_visits
84
85days_stats :
86 day =>
87 viewed_bandwidth
88 not_viewed_bandwidth
89 viewed_pages
90 viewed_hits
91 nb_visits
92 nb_visitors
93
94visits :
95 remote_addr =>
96 remote_addr
97 remote_ip
98 viewed_pages{0..31} # 0 contains total
99 viewed_hits{0..31} # 0 contains total
100 not_viewed_pages{0..31}
101 not_viewed_hits{0..31}
102 bandwidth{0..31}
103 last_access
104 requests =>
105 [fields_from_format_log]
106 extract_request =>
107 http_method
108 http_uri
109 http_version
110 extract_uri
111 extract_parameters*
112 extract_referer* =>
113 extract_uri
114 extract_parameters*
115 robot
116 hit_only
117 is_page
118
119valid_visitors:
120 month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
121
122Statistics update :
123 None
124
125Statistics deletion :
126 None
127"""
128
129
class IWLA(object):
    """Main analyzer: parses HTTP log files, computes statistics, calls
    plugins (pre-analysis, post-analysis, display hooks) and produces the
    HTML output tree."""

    ANALYSIS_CLASS = 'HTTP'   # Kind of log analyzed (only HTTP supported)
    API_VERSION = 1           # Plugin API version
    IWLA_VERSION = '0.5-dev'  # Software version string

    def __init__(self, logLevel, dry_run):
        # logLevel: logging level constant (e.g. logging.INFO)
        # dry_run: when True, nothing is written to disk
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None
        self.dry_run = dry_run

        # Turn conf.log_format into a regex: escape non-word characters,
        # then replace each $field with a named capture group.
        self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        # "METHOD URI VERSION" splitter for the raw request field
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        # Split an URI into path, optional query string and optional fragment
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.final_slashes_re = re.compile(r'/+$')
        # Pre-compiled regexps of addresses to ignore (conf.excluded_ip)
        self.excluded_ip = []
        for ip in conf.excluded_ip:
            self.excluded_ip += [re.compile(ip)]
        # (hook directory, configured hook names) pairs scanned by _callPlugins()
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        if self.dry_run:
            self.logger.info('==> Start (DRY RUN)')
        else:
            self.logger.info('==> Start')
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            # Missing locale: fall back to untranslated strings
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = t.ugettext  # translation shortcut (Python 2 gettext API)
173
174 def getVersion(self):
175 return IWLA.IWLA_VERSION
176
177 def getConfValue(self, key, default=None):
178 if not key in dir(conf):
179 return default
180 else:
181 return conf.__dict__[key]
182
183 def _clearVisits(self):
184 self.current_analysis = {
185 'days_stats' : {},
186 'month_stats' : {},
187 'visits' : {}
188 }
189 self.valid_visitors = None
190 return self.current_analysis
191
    def getDaysStats(self):
        # Per-day statistics dict of the month being analyzed
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        # Aggregated statistics of the month being analyzed
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        # remote_addr => visitor info of the month being analyzed
        return self.current_analysis['visits']

    def getValidVisitors(self):
        # Visitors kept after robot/hit-only filtering
        # (None until _generateMonthStats() has run)
        return self.valid_visitors

    def getDisplay(self):
        # DisplayHTMLBuild instance used to produce the HTML output
        return self.display

    def getCurTime(self):
        # time.struct_time of the last processed log line
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        # time.struct_time when the current analysis pass started
        return self.meta_infos['start_analysis_time']
212
213 def isValidForCurrentAnalysis(self, request):
214 cur_time = self.meta_infos['start_analysis_time']
215 # Analyse not started
216 if not cur_time: return False
217 return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
218
    def hasBeenViewed(self, request):
        # A hit counts as "viewed" when its HTTP status code is listed
        # in conf.viewed_http_codes (typically 200)
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        # Relative path of filename inside the year/month output directory
        # of the last processed log line
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
225
    def getResourcesPath(self):
        # Directory holding static resources (conf.resources_path)
        return conf.resources_path

    def getCSSPath(self):
        # Directory holding CSS files (conf.css_path)
        return conf.css_path

    def _clearMeta(self):
        """Reset the cross-month meta information to an empty state."""
        self.meta_infos = {
            'last_time' : None,
            'start_analysis_time' : None
        }
        return self.meta_infos
238
    def _clearDisplay(self):
        """Drop every page built so far in the display."""
        self.display.clear()
        return self.display

    def getDBFilename(self, time):
        # DB_ROOT/year/month/DB_FILENAME for the given struct_time
        return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
245
246 def _openDB(self, filename, prot='r'):
247 if self.args.dont_compress:
248 return open(filename, prot)
249 else:
250 return gzip.open(filename, prot)
251
252 def _serialize(self, obj, filename):
253 if self.dry_run: return
254 base = os.path.dirname(filename)
255 if not os.path.exists(base):
256 os.makedirs(base)
257
258 # Make a backup in case of something fails
259 if os.path.exists(filename):
260 shutil.copy(filename, filename + '.bak')
261
262 with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
263 cPickle.dump(obj, f)
264 f.seek(0)
265 fzip.write(f.read())
266 os.fsync(fzip)
267 os.remove(filename + '.tmp')
268 if os.path.exists(filename + '.bak'):
269 os.remove(filename + '.bak')
270
271 def _deserialize(self, filename):
272 if not os.path.exists(filename):
273 return None
274
275 res = None
276 with self._openDB(filename) as f:
277 res = cPickle.load(f)
278 return res
279
280 def _callPlugins(self, target_root, *args):
281 self.logger.info('==> Call plugins (%s)' % (target_root))
282 for (root, plugins) in self.plugins:
283 if root != target_root: continue
284 for p in plugins:
285 mod = self.cache_plugins.get(root + '.' + p, None)
286 if mod:
287 self.logger.info('\t%s' % (p))
288 mod.hook(*args)
289
    def isPage(self, request):
        """Return True when the URI ends with a configured page extension
        (conf.pages_extensions), as opposed to a plain hit (image, CSS...)."""
        self.logger.debug("Is page %s" % (request))
        for e in conf.pages_extensions:
            if request.endswith(e):
                self.logger.debug("True")
                return True
        self.logger.debug("False")
        return False

    def isMultimediaFile(self, request):
        """Return True when the URI ends with a configured multimedia
        extension (conf.multimedia_files)."""
        self.logger.debug("Is multimedia %s" % (request))
        for e in conf.multimedia_files:
            if request.endswith(e):
                self.logger.debug("True")
                return True
        self.logger.debug("False")
        return False
307
    def isValidVisitor(self, hit):
        """A visitor is valid when it's not a robot and — unless
        conf.count_hit_only_visitors is set — viewed at least one page."""
        if hit['robot']: return False
        if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
            return False
        return True

    def isRobot(self, hit):
        # 'robot' flag is set elsewhere (defaults to False in _createVisitor)
        return hit['robot']
316
    def _appendHit(self, hit):
        """Account one parsed log line into the per-visitor statistics."""
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        # Skip explicitly excluded addresses (conf.excluded_ip regexps)
        for ip in self.excluded_ip:
            if ip.match(remote_addr):
                return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        # Don't keep all requests for robots
        if not super_hit['robot']:
            super_hit['requests'].append(hit)

        # Per-day counters are keyed by day of month; key 0 holds the total
        day = self.meta_infos['last_time'].tm_mday
        if self.hasBeenViewed(hit):
            super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
            super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        # Fall back to the raw URI when no normalized one was extracted
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        # Robots and non-viewed hits are accounted separately
        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
            super_hit[page_key][0] += 1
        else:
            super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
            super_hit[hit_key][0] += 1
360
361 def _createVisitor(self, hit):
362 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
363 super_hit['remote_addr'] = hit['remote_addr']
364 super_hit['remote_ip'] = hit['remote_addr']
365 super_hit['viewed_pages'] = {0:0}
366 super_hit['viewed_hits'] = {0:0}
367 super_hit['not_viewed_pages'] = {0:0}
368 super_hit['not_viewed_hits'] = {0:0}
369 super_hit['bandwidth'] = {0:0}
370 super_hit['last_access'] = self.meta_infos['last_time']
371 super_hit['requests'] = []
372 super_hit['robot'] = False
373 super_hit['hit_only'] = 0
374
375 def _normalizeURI(self, uri):
376 if uri == '/': return uri
377 uri = self.final_slashes_re.sub('/', uri)
378 return uri
379
380 def _removeFinalSlashes(self, uri):
381 if uri == '/': return uri
382 return self.final_slashes_re.sub('', uri)
383
384 def _normalizeParameters(self, parameters):
385 # No parameters
386 if parameters == '?': return None
387 return parameters
388
    def _decodeHTTPRequest(self, hit):
        """Split the raw 'request' field into method/URI/version and decode
        the URI (and referer, when present) into path + query parameters.

        Returns False when the request line cannot be parsed."""
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict("")
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict("")
                hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
                # Only keep a non-empty, normalized query string
                if 'extract_parameters' in d.keys():
                    parameters = self._normalizeParameters(d['extract_parameters'])
                    if parameters:
                        hit['extract_request']['extract_parameters'] = parameters
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict("")
                hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
                hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
        return True
415
    def _decodeTime(self, hit):
        """Parse hit['time_local'] with conf.time_format into a struct_time,
        stored in hit['time_decoded'] and returned.

        On Python < 3.2 %z is not supported by strptime, so on failure the
        trailing ' +HHMM' UTC offset is dropped and the rest re-parsed."""
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError, e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                # NOTE(review): offset correction is deliberately disabled —
                # the decoded time stays in the log's local time.
                # if gmt_offset_str[0] == '-':
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                # else:
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']
434
435 def getDisplayIndex(self):
436 cur_time = self.meta_infos['last_time']
437 filename = self.getCurDisplayPath('index.html')
438
439 return self.display.getPage(filename)
440
    def _generateDisplayDaysStats(self):
        """Build the current month's index.html: a by-day table/graph with
        one row per day plus Average and Total rows."""
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        # The yearly summary is embedded at the top through an iframe
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0  # number of days that actually have stats
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                # No hit recorded that day
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # Columns 4/5 hold bandwidth, shown human-readable
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            # Highlight week-ends and the current day
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Average over days that have stats (Python 2: map returns a list)
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)
499
    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append a per-month summary table for *year* to *page*, and build
        the standalone year/_stats.html page (embedded through an iframe
        in the monthly pages)."""
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        # months_ : same table without the 'Details' column, for _stats.html
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                # No stats recorded for that month
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            # Columns 5/6 hold bandwidth, shown human-readable
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            # Highlight the current month
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        # Built immediately (not registered in self.display)
        page_.build(conf.DISPLAY_ROOT, False)
550
    def _generateDisplayWholeMonthStats(self):
        """Build the top-level index.html: last-update stamp, analysis
        duration, and one per-month summary table per year."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        # Elapsed time since start(), rendered as h/m/s
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        # Most recent years first
        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)
573
574 def _compressFile(self, build_time, root, filename):
575 path = os.path.join(root, filename)
576 gz_path = path + '.gz'
577
578 self.logger.debug('Compress %s => %s' % (path, gz_path))
579
580 if not os.path.exists(gz_path) or\
581 os.stat(path).st_mtime >= build_time:
582 if self.dry_run: return
583 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
584 f_out.write(f_in.read())
585
    def _compressFiles(self, build_time, root):
        """Walk the output tree under *root* and gzip every file whose name
        ends with one of conf.compress_output_files (lets a web server
        serve pre-compressed content)."""
        if not conf.compress_output_files: return
        for rootdir, subdirs, files in os.walk(root, followlinks=True):
            for f in files:
                for ext in conf.compress_output_files:
                    if f.endswith(ext):
                        self._compressFile(build_time, rootdir, f)
                        break
594
    def _generateDisplay(self):
        """Build all HTML pages: day stats, display plugins, whole-month
        summary, then write and compress the output tree."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        build_time = time.mktime(time.localtime())
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)
602
603 def _createEmptyStats(self):
604 stats = {}
605 stats['viewed_bandwidth'] = 0
606 stats['not_viewed_bandwidth'] = 0
607 stats['viewed_pages'] = 0
608 stats['viewed_hits'] = 0
609 stats['nb_visits'] = 0
610
611 return stats
612
613 def _generateMonthStats(self):
614 self._clearDisplay()
615
616 visits = self.current_analysis['visits']
617
618 stats = self._createEmptyStats()
619 for (day, stat) in self.current_analysis['days_stats'].items():
620 for k in stats.keys():
621 stats[k] += stat[k]
622
623 duplicated_stats = {k:v for (k,v) in stats.items()}
624
625 cur_time = self.meta_infos['last_time']
626 self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
627 self.logger.info(stats)
628
629 if not 'month_stats' in self.current_analysis.keys():
630 self.current_analysis['month_stats'] = stats
631 else:
632 for (k,v) in stats.items():
633 self.current_analysis['month_stats'][k] = v
634
635 self.valid_visitors = {}
636 for (k,v) in visits.items():
637 if self.isValidVisitor(v):
638 self.valid_visitors[k] = v
639
640 duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
641
642 self._callPlugins(conf.POST_HOOK_DIRECTORY)
643
644 if args.display_only:
645 self._generateDisplay()
646 return
647
648 path = self.getDBFilename(cur_time)
649
650 self.logger.info("==> Serialize to %s" % (path))
651 self._serialize(self.current_analysis, path)
652
653 # Save month stats
654 year = cur_time.tm_year
655 month = cur_time.tm_mon
656 if not 'stats' in self.meta_infos.keys():
657 self.meta_infos['stats'] = {}
658 if not year in self.meta_infos['stats'].keys():
659 self.meta_infos['stats'][year] = {}
660 self.meta_infos['stats'][year][month] = duplicated_stats
661
662 self.logger.info("==> Serialize to %s" % (conf.META_PATH))
663 self._serialize(self.meta_infos, conf.META_PATH)
664
665 self._generateDisplay()
666
667 def _generateDayStats(self):
668 if args.display_only:
669 return
670
671 visits = self.current_analysis['visits']
672 cur_time = self.meta_infos['last_time']
673
674 self._callPlugins(conf.PRE_HOOK_DIRECTORY)
675
676 stats = self._createEmptyStats()
677
678 day = cur_time.tm_mday
679 for (k, super_hit) in visits.items():
680 if super_hit['last_access'].tm_mday != day:
681 continue
682 if super_hit['robot']:
683 stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
684 continue
685 stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
686 stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
687 stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
688 if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
689 super_hit['viewed_pages'].get(day, 0)):
690 stats['nb_visits'] += 1
691
692 self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
693 self.logger.info(stats)
694
695 self.current_analysis['days_stats'][cur_time.tm_mday] = stats
696
    def _newHit(self, hit):
        """Process one parsed log line: filter by domain and time, handle
        day/month rollover, then account the hit.

        Returns True when the hit was accounted."""
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s" % (hit))
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            # First hit ever: resume from an existing database if any
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            # Skip lines already analyzed during a previous run
            if not self.analyse_started and\
               time.mktime(t) <= time.mktime(cur_time):
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            if t < cur_time: # Don't accept past hits
                return False
            if cur_time.tm_mon != t.tm_mon:
                # Month rollover: close the current month and load/create
                # the database of the new one
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                # Day rollover
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        # Normalize empty log fields ('-' or '*') to empty strings
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True
741
742 def _reset(self):
743 reset_time = time.strptime(self.args.reset, '%m/%Y')
744
745 self.logger.info('Reset time')
746 self.logger.info(reset_time)
747
748 self.meta_infos['last_time'] = reset_time
749
750 cur_time = time.localtime()
751 year = reset_time.tm_year
752 while year < cur_time.tm_year:
753 db_path = os.path.join(conf.DB_ROOT, str(year))
754 if os.path.exists(db_path): shutil.rmtree(db_path)
755 output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
756 if os.path.exists(output_path): shutil.rmtree(output_path)
757 year += 1
758 month = reset_time.tm_mon
759 while month <= cur_time.tm_mon:
760 db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
761 if os.path.exists(db_path): shutil.rmtree(db_path)
762 output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
763 if os.path.exists(output_path): shutil.rmtree(output_path)
764 month += 1
765
    def start(self, _file, args):
        """Entry point: load previous databases, parse every line of
        *_file* (an iterable of log lines), then finalize the day and
        month statistics.

        args: parsed command-line arguments (stored as self.args)."""
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict(""))
            else:
                # Line doesn't match conf.log_format: report and keep going
                self.logger.warning("No match for %s" % (l))

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            # start_analysis_time is only meaningful during a run
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')
805
806
class FileIter(object):
    """Iterate over the lines of one or more log files.

    *filenames* is a comma-separated list; '.gz' files are transparently
    decompressed. Iteration stops after the last file is exhausted.
    """

    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                # Parentheses keep this valid on both Python 2 and 3
                print('No such file \'%s\'' % (f))
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol, delegates to next()
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one; raise
        StopIteration when no file is left."""
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        """Return the next line without its newline terminator."""
        l = self.cur_file.readline()
        if not l:
            self._openNextFile()
            l = self.cur_file.readline()
        # Fix: only strip an actual trailing newline — the previous l[:-1]
        # dropped the last character of a final line with no terminator.
        if l.endswith('\n'):
            l = l[:-1]
        return l
841
if __name__ == '__main__':
    # Command-line interface
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
                        default=False,
                        help='Only generate display')

    parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
                        default=False,
                        help='Process log but don\'t write files (database and HTML) to disk')

    args = parser.parse_args()

    # Merge user conf into default conf: plain keys override the default
    # value, keys ending in '_append' extend the corresponding default list.
    for (k,v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if type(conf.__dict__[new_k]) == list:
                    if type(v) == list:
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

    # Optionally wipe previous databases and output (never in dry-run mode)
    if args.clean_output and not args.dry_run:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    # Map the log level name to its logging constant
    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel, args.dry_run)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags