iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
22import os
23import shutil
24import sys
25import re
26import time
27import cPickle
28import gzip
29import importlib
30import argparse
31import logging
32import gettext
33from calendar import monthrange
34from datetime import date, datetime
35
36import default_conf as conf
37import conf as user_conf
38
39from iplugin import *
40from display import *
41
42"""
43Main class IWLA
44Parse Log, compute them, call plugins and produce output
45For now, only HTTP log are valid
46
47Plugin requirements :
48 None
49
50Conf values needed :
51 analyzed_filename
52 domain_name
53 locales_path
54 compress_output_files*
55
56Output files :
57 DB_ROOT/meta.db
58 DB_ROOT/year/month/iwla.db
59 OUTPUT_ROOT/index.html
60 OUTPUT_ROOT/year/_stats.html
61 OUTPUT_ROOT/year/month/index.html
62
63Statistics creation :
64
65meta :
66 last_time
67 start_analysis_time
68 stats =>
69 year =>
70 month =>
71 viewed_bandwidth
72 not_viewed_bandwidth
73 viewed_pages
74 viewed_hits
75 nb_visits
76 nb_visitors
77
78month_stats :
79 viewed_bandwidth
80 not_viewed_bandwidth
81 viewed_pages
82 viewed_hits
83 nb_visits
84
85days_stats :
86 day =>
87 viewed_bandwidth
88 not_viewed_bandwidth
89 viewed_pages
90 viewed_hits
91 nb_visits
92 nb_visitors
93
94visits :
95 remote_addr =>
96 remote_addr
97 remote_ip
98 viewed_pages{0..31} # 0 contains total
99 viewed_hits{0..31} # 0 contains total
100 not_viewed_pages{0..31}
101 not_viewed_hits{0..31}
102 bandwidth{0..31}
103 last_access
104 requests =>
105 [fields_from_format_log]
106 extract_request =>
107 http_method
108 http_uri
109 http_version
110 extract_uri
111 extract_parameters*
112 extract_referer* =>
113 extract_uri
114 extract_parameters*
115 robot
116 hit_only
117 is_page
118
119valid_visitors:
120 month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
121
122Statistics update :
123 None
124
125Statistics deletion :
126 None
127"""
128
129
class IWLA(object):
    """Main analyzer: parses HTTP logs, computes statistics, calls
    plugins and produces HTML output (see module header for the
    database/statistics layout)."""

    # Kind of log analyzed (exposed to plugins)
    ANALYSIS_CLASS = 'HTTP'
    # Plugin API compatibility version
    API_VERSION = 1
    # Software version string displayed in the output
    IWLA_VERSION = '0.5-dev'

    def __init__(self, logLevel, dry_run):
        """Prepare analyzer state, compile log/URI regexps from the
        configuration, set up logging and load the locale.

        logLevel -- logging module level (e.g. logging.INFO)
        dry_run  -- when True, nothing is written to disk
        """
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None
        self.dry_run = dry_run

        # Turn conf.log_format into a regexp: escape everything that is
        # not '$', '?' or a word char, then map each $field to a named
        # capture group.
        self.log_format_extracted = re.sub(r'([^\$?\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        # Split a URI into path part, optional ?parameters and #fragment
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.final_slashes_re = re.compile(r'/+$')
        # (hook directory, list of plugin names) for each plugin stage
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        if self.dry_run:
            self.logger.info('==> Start (DRY RUN)')
        else:
            self.logger.info('==> Start')
        # Fall back to NullTranslations (English) when the locale
        # catalog cannot be found
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        self._ = t.ugettext
170
171 def getVersion(self):
172 return IWLA.IWLA_VERSION
173
174 def getConfValue(self, key, default=None):
175 if not key in dir(conf):
176 return default
177 else:
178 return conf.__dict__[key]
179
180 def _clearVisits(self):
181 self.current_analysis = {
182 'days_stats' : {},
183 'month_stats' : {},
184 'visits' : {}
185 }
186 self.valid_visitors = None
187 return self.current_analysis
188
189 def getDaysStats(self):
190 return self.current_analysis['days_stats']
191
192 def getMonthStats(self):
193 return self.current_analysis['month_stats']
194
195 def getCurrentVisits(self):
196 return self.current_analysis['visits']
197
198 def getValidVisitors(self):
199 return self.valid_visitors
200
201 def getDisplay(self):
202 return self.display
203
204 def getCurTime(self):
205 return self.meta_infos['last_time']
206
207 def getStartAnalysisTime(self):
208 return self.meta_infos['start_analysis_time']
209
210 def isValidForCurrentAnalysis(self, request):
211 cur_time = self.meta_infos['start_analysis_time']
212 # Analyse not started
213 if not cur_time: return False
214 return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
215
216 def hasBeenViewed(self, request):
217 return int(request['status']) in conf.viewed_http_codes
218
219 def getCurDisplayPath(self, filename):
220 cur_time = self.meta_infos['last_time']
221 return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
222
223 def getResourcesPath(self):
224 return conf.resources_path
225
226 def getCSSPath(self):
227 return conf.css_path
228
229 def _clearMeta(self):
230 self.meta_infos = {
231 'last_time' : None,
232 'start_analysis_time' : None
233 }
234 return self.meta_infos
235
236 def _clearDisplay(self):
237 self.display.clear()
238return self.display
239
240 def getDBFilename(self, time):
241 return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
242
243 def _openDB(self, filename, prot='r'):
244 if self.args.dont_compress:
245 return open(filename, prot)
246 else:
247 return gzip.open(filename, prot)
248
    def _serialize(self, obj, filename):
        """Pickle *obj* into *filename* (gzipped unless --dont-compress).

        A '.bak' copy of the previous database and a '.tmp' staging file
        are used so a crash mid-write does not lose the old data.
        No-op in dry-run mode.
        """
        if self.dry_run: return
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # Make a backup in case of something fails
        if os.path.exists(filename):
            shutil.copy(filename, filename + '.bak')

        # Dump to a plain temporary file first, then stream its content
        # through the (possibly gzip) database writer.
        with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
            cPickle.dump(obj, f)
            f.seek(0)
            fzip.write(f.read())
            # NOTE(review): fsync before the gzip file is closed may not
            # flush all compressed data to disk -- confirm intent.
            os.fsync(fzip)
        os.remove(filename + '.tmp')
        if os.path.exists(filename + '.bak'):
            os.remove(filename + '.bak')
267
268 def _deserialize(self, filename):
269 if not os.path.exists(filename):
270 return None
271
272 res = None
273 with self._openDB(filename) as f:
274 res = cPickle.load(f)
275 return res
276
277 def _callPlugins(self, target_root, *args):
278 self.logger.info('==> Call plugins (%s)' % (target_root))
279 for (root, plugins) in self.plugins:
280 if root != target_root: continue
281 for p in plugins:
282 mod = self.cache_plugins.get(root + '.' + p, None)
283 if mod:
284 self.logger.info('\t%s' % (p))
285 mod.hook(*args)
286
287 def isPage(self, request):
288 self.logger.debug("Is page %s" % (request))
289 for e in conf.pages_extensions:
290 if request.endswith(e):
291 self.logger.debug("True")
292 return True
293 self.logger.debug("False")
294 return False
295
296 def isMultimediaFile(self, request):
297 self.logger.debug("Is multimedia %s" % (request))
298 for e in conf.multimedia_files:
299 if request.endswith(e):
300 self.logger.debug("True")
301 return True
302 self.logger.debug("False")
303 return False
304
305 def isValidVisitor(self, hit):
306 if hit['robot']: return False
307 if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
308 return False
309 return True
310
311 def isRobot(self, hit):
312 return hit['robot']
313
    def _appendHit(self, hit):
        """Attach *hit* to its visitor entry (creating it when needed)
        and update per-day counters.

        Counters are dicts day -> count where index 0 holds the total.
        Robots and non-viewed responses feed the not_viewed_* counters.
        """
        remote_addr = hit['remote_addr']

        # Hits with no remote address are ignored
        if not remote_addr: return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        # Don't keep all requests for robots
        if not super_hit['robot']:
            super_hit['requests'].append(hit)

        day = self.meta_infos['last_time'].tm_mday
        # Bandwidth is only accounted for viewed responses
        if self.hasBeenViewed(hit):
            super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
            super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        # Fall back to the raw URI when extraction failed
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        # Update both the per-day counter and the total (index 0)
        if hit['is_page']:
            super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
            super_hit[page_key][0] += 1
        else:
            super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
            super_hit[hit_key][0] += 1
353
354 def _createVisitor(self, hit):
355 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
356 super_hit['remote_addr'] = hit['remote_addr']
357 super_hit['remote_ip'] = hit['remote_addr']
358 super_hit['viewed_pages'] = {0:0}
359 super_hit['viewed_hits'] = {0:0}
360 super_hit['not_viewed_pages'] = {0:0}
361 super_hit['not_viewed_hits'] = {0:0}
362 super_hit['bandwidth'] = {0:0}
363 super_hit['last_access'] = self.meta_infos['last_time']
364 super_hit['requests'] = []
365 super_hit['robot'] = False
366 super_hit['hit_only'] = 0
367
368 def _normalizeURI(self, uri):
369 if uri == '/': return uri
370 uri = self.final_slashes_re.sub('/', uri)
371 return uri
372
373 def _removeFinalSlashes(self, uri):
374 if uri == '/': return uri
375 return self.final_slashes_re.sub('', uri)
376
377 def _normalizeParameters(self, parameters):
378 # No parameters
379 if parameters == '?': return None
380 return parameters
381
    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] into method/URI/version and decompose
        the URI (and the referer) into path and parameters.

        Fills hit['extract_request'] and (optionally)
        hit['extract_referer'].  Returns False when the request line
        cannot be parsed.
        """
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict("")
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict("")
                hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
                # Parameters are only stored when present and non-empty
                if 'extract_parameters' in d.keys():
                    parameters = self._normalizeParameters(d['extract_parameters'])
                    if parameters:
                        hit['extract_request']['extract_parameters'] = parameters
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict("")
                hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
                hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
        return True
408
    def _decodeTime(self, hit):
        """Parse hit['time_local'] into a struct_time, stored in
        hit['time_decoded'] and returned.

        Python < 3.2 does not recognize %z: in that case retry without
        the trailing UTC offset (last 6 characters of the value, last 3
        of the format).
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError, e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                # NOTE(review): the offset below is computed but the
                # adjustment is deliberately left disabled.
                # if gmt_offset_str[0] == '-':
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                # else:
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']
427
428 def getDisplayIndex(self):
429 cur_time = self.meta_infos['last_time']
430 filename = self.getCurDisplayPath('index.html')
431
432 return self.display.getPage(filename)
433
    def _generateDisplayDaysStats(self):
        """Build the per-day statistics table for the current month
        (year/MM/index.html), including average and total rows."""
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        # Embed the yearly summary above the day table
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        # One row per day of the month, zero-filled for days without data
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # Bandwidth columns displayed in human readable form
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            # Highlight week-ends and the current day
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # Average over the days that actually had data
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)
492
    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append a yearly summary table (one row per month) to *page*,
        and also emit a details-free copy as year/_stats.html (embedded
        as an iframe by the month pages)."""
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        # 'months' goes to the main page; 'months_' is the same table
        # without the Details column, written to _stats.html
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            # Bandwidth columns displayed in human readable form
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            # Highlight the current month
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        # Written immediately (not registered in the display build list)
        page_.build(conf.DISPLAY_ROOT, False)
543
    def _generateDisplayWholeMonthStats(self):
        """Build the main index page: last update time, analysis
        duration and one summary table per year (most recent first)."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # NOTE(review): '%02d' is not a standard strftime directive
        # ('%d' is already zero padded) -- works with glibc, confirm
        # portability.
        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        # Most recent years first
        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)
566
567 def _compressFile(self, build_time, root, filename):
568 path = os.path.join(root, filename)
569 gz_path = path + '.gz'
570
571 self.logger.debug('Compress %s => %s' % (path, gz_path))
572
573 if not os.path.exists(gz_path) or\
574 os.stat(path).st_mtime >= build_time:
575 if self.dry_run: return
576 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
577 f_out.write(f_in.read())
578
579 def _compressFiles(self, build_time, root):
580 if not conf.compress_output_files: return
581 for rootdir, subdirs, files in os.walk(root, followlinks=True):
582 for f in files:
583 for ext in conf.compress_output_files:
584 if f.endswith(ext):
585 self._compressFile(build_time, rootdir, f)
586 break
587
588 def _generateDisplay(self):
589 self._generateDisplayDaysStats()
590 self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
591 self._generateDisplayWholeMonthStats()
592 build_time = time.mktime(time.localtime())
593 self.display.build(conf.DISPLAY_ROOT)
594 self._compressFiles(build_time, conf.DISPLAY_ROOT)
595
596 def _createEmptyStats(self):
597 stats = {}
598 stats['viewed_bandwidth'] = 0
599 stats['not_viewed_bandwidth'] = 0
600 stats['viewed_pages'] = 0
601 stats['viewed_hits'] = 0
602 stats['nb_visits'] = 0
603
604 return stats
605
606 def _generateMonthStats(self):
607 self._clearDisplay()
608
609 visits = self.current_analysis['visits']
610
611 stats = self._createEmptyStats()
612 for (day, stat) in self.current_analysis['days_stats'].items():
613 for k in stats.keys():
614 stats[k] += stat[k]
615
616 duplicated_stats = {k:v for (k,v) in stats.items()}
617
618 cur_time = self.meta_infos['last_time']
619 self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
620 self.logger.info(stats)
621
622 if not 'month_stats' in self.current_analysis.keys():
623 self.current_analysis['month_stats'] = stats
624 else:
625 for (k,v) in stats.items():
626 self.current_analysis['month_stats'][k] = v
627
628 self.valid_visitors = {}
629 for (k,v) in visits.items():
630 if self.isValidVisitor(v):
631 self.valid_visitors[k] = v
632
633 duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
634
635 self._callPlugins(conf.POST_HOOK_DIRECTORY)
636
637 if args.display_only:
638 self._generateDisplay()
639 return
640
641 path = self.getDBFilename(cur_time)
642
643 self.logger.info("==> Serialize to %s" % (path))
644 self._serialize(self.current_analysis, path)
645
646 # Save month stats
647 year = cur_time.tm_year
648 month = cur_time.tm_mon
649 if not 'stats' in self.meta_infos.keys():
650 self.meta_infos['stats'] = {}
651 if not year in self.meta_infos['stats'].keys():
652 self.meta_infos['stats'][year] = {}
653 self.meta_infos['stats'][year][month] = duplicated_stats
654
655 self.logger.info("==> Serialize to %s" % (conf.META_PATH))
656 self._serialize(self.meta_infos, conf.META_PATH)
657
658 self._generateDisplay()
659
660 def _generateDayStats(self):
661 if args.display_only:
662 return
663
664 visits = self.current_analysis['visits']
665 cur_time = self.meta_infos['last_time']
666
667 self._callPlugins(conf.PRE_HOOK_DIRECTORY)
668
669 stats = self._createEmptyStats()
670
671 day = cur_time.tm_mday
672 for (k, super_hit) in visits.items():
673 if super_hit['last_access'].tm_mday != day:
674 continue
675 if super_hit['robot']:
676 stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
677 continue
678 stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
679 stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
680 stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
681 if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
682 super_hit['viewed_pages'].get(day, 0)):
683 stats['nb_visits'] += 1
684
685 self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
686 self.logger.info(stats)
687
688 self.current_analysis['days_stats'][cur_time.tm_mday] = stats
689
    def _newHit(self, hit):
        """Process one parsed log line.

        Handles day/month rollover (generating stats and switching the
        per-month database), filters out foreign domains, out-of-range
        timestamps and non GET/POST methods.  Returns True when the hit
        was counted.
        """
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s" % (hit))
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            # First hit ever: load (or create) the database of the
            # hit's month
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            # Skip lines already analyzed during a previous run
            if not self.analyse_started and\
               time.mktime(t) <= time.mktime(cur_time):
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            if t < cur_time: # Don't accept past hits
                return False
            if cur_time.tm_mon != t.tm_mon:
                # Month changed: finalize current month, switch DB
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                # Day changed: finalize current day
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        # Normalize empty log fields ('-' or '*') to ''
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True
734
735 def _reset(self):
736 reset_time = time.strptime(self.args.reset, '%m/%Y')
737
738 self.logger.info('Reset time')
739 self.logger.info(reset_time)
740
741 self.meta_infos['last_time'] = reset_time
742
743 cur_time = time.localtime()
744 year = reset_time.tm_year
745 while year < cur_time.tm_year:
746 db_path = os.path.join(conf.DB_ROOT, str(year))
747 if os.path.exists(db_path): shutil.rmtree(db_path)
748 output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
749 if os.path.exists(output_path): shutil.rmtree(output_path)
750 year += 1
751 month = reset_time.tm_mon
752 while month <= cur_time.tm_mon:
753 db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
754 if os.path.exists(db_path): shutil.rmtree(db_path)
755 output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
756 if os.path.exists(output_path): shutil.rmtree(output_path)
757 month += 1
758
    def start(self, _file, args):
        """Entry point: load the previous databases, preload plugins,
        analyse every line of *_file* (an iterable of log lines) and
        generate statistics and display.

        args -- parsed command line arguments (stored for later use)
        """
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            # Optionally wipe everything back to the --reset date first
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict(""))
            else:
                self.logger.warning("No match for %s" % (l))
                #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            # Transient value, dropped from memory after the run
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')
798
799
class FileIter(object):
    """Iterate over the lines of several (possibly gzipped) log files.

    *filenames* is a comma separated list; files are read in order and
    every yielded line has its trailing newline stripped.  Exits the
    program when one of the files does not exist.
    """

    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print 'No such file \'%s\'' % (f)
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol, delegates to next()
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one; raises
        StopIteration when the list is exhausted."""
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        # gz files are transparently decompressed
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        """Return the next line (without newline), switching to the
        next file on EOF."""
        l = self.cur_file.readline()
        if not l:
            self._openNextFile()
            l = self.cur_file.readline()
        return l[:-1]
834
if __name__ == '__main__':
    # Command line interface: parse options, merge user configuration
    # over the defaults, then run the analysis.
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
                        default=False,
                        help='Only generate display')

    parser.add_argument('-D', '--dry-run', dest='dry_run', action='store_true',
                        default=False,
                        help='Process log but don\'t write files (database and HTML) to disk')

    args = parser.parse_args()

    # Load user conf: plain keys override the defaults; keys suffixed
    # with '_append' extend the matching default list instead.
    for (k,v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if type(conf.__dict__[new_k]) == list:
                    if type(v) == list:
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

    if args.clean_output and not args.dry_run:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    # Map the textual log level to a logging constant
    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel, args.dry_run)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags