iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
import argparse
import gettext
import gzip
import importlib
import logging
import os
import pickle
import re
import shutil
import sys
import time
from calendar import monthrange
from datetime import date, datetime

import default_conf as conf
import conf as user_conf

from iplugin import *
from display import *
41
"""
Main class IWLA
Parse logs, compute statistics, call plugins and produce output
For now, only HTTP logs are valid

Plugin requirements :
    None

Conf values needed :
    analyzed_filename
    domain_name
    locales_path
    compress_output_files*

Output files :
    DB_ROOT/meta.db
    DB_ROOT/year/month/iwla.db
    OUTPUT_ROOT/index.html
    OUTPUT_ROOT/year/_stats.html
    OUTPUT_ROOT/year/month/index.html

Statistics creation :

meta :
    last_time
    start_analysis_time
    stats =>
        year =>
            month =>
                viewed_bandwidth
                not_viewed_bandwidth
                viewed_pages
                viewed_hits
                nb_visits
                nb_visitors

month_stats :
    viewed_bandwidth
    not_viewed_bandwidth
    viewed_pages
    viewed_hits
    nb_visits

days_stats :
    day =>
        viewed_bandwidth
        not_viewed_bandwidth
        viewed_pages
        viewed_hits
        nb_visits
        nb_visitors

visits :
    remote_addr =>
        remote_addr
        remote_ip
        viewed_pages
        viewed_hits
        not_viewed_pages
        not_viewed_hits
        bandwidth
        last_access
        requests =>
            [fields_from_format_log]
            extract_request =>
                http_method
                http_uri
                http_version
                extract_uri
                extract_parameters*
            extract_referer* =>
                extract_uri
                extract_parameters*
        robot
        hit_only
        is_page

valid_visitors:
    month_stats without robot and hit-only visitors (if not conf.count_hit_only_visitors)

Statistics update :
    None

Statistics deletion :
    None
"""
128
129
130class IWLA(object):
131
132 ANALYSIS_CLASS = 'HTTP'
133 API_VERSION = 1
134 IWLA_VERSION = '0.5-dev'
135
136 def __init__(self, logLevel):
137 self.meta_infos = {}
138 self.analyse_started = False
139 self.current_analysis = {}
140 self.start_time = 0
141 self.cache_plugins = {}
142 self.display = DisplayHTMLBuild(self)
143 self.valid_visitors = None
144
145 self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
146 self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
147 self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
148 self.log_re = re.compile(self.log_format_extracted)
149 self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
150 self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
151 self.final_slashes_re = re.compile(r'/+$')
152 self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
153 (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
154 (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]
155
156 logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
157 self.logger = logging.getLogger(self.__class__.__name__)
158 self.logger.info('==> Start')
159 try:
160 t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
161 self.logger.info('\tUsing locale %s' % (conf.locale))
162 except IOError:
163 t = gettext.NullTranslations()
164 self.logger.info('\tUsing default locale en_EN')
165 self._ = t.ugettext
166
167 def getVersion(self):
168 return IWLA.IWLA_VERSION
169
170 def getConfValue(self, key, default=None):
171 if not key in dir(conf):
172 return default
173 else:
174 return conf.__dict__[key]
175
176 def _clearVisits(self):
177 self.current_analysis = {
178 'days_stats' : {},
179 'month_stats' : {},
180 'visits' : {}
181 }
182 self.valid_visitors = None
183 return self.current_analysis
184
185 def getDaysStats(self):
186 return self.current_analysis['days_stats']
187
188 def getMonthStats(self):
189 return self.current_analysis['month_stats']
190
191 def getCurrentVisits(self):
192 return self.current_analysis['visits']
193
194 def getValidVisitors(self):
195 return self.valid_visitors
196
197 def getDisplay(self):
198 return self.display
199
200 def getCurTime(self):
201 return self.meta_infos['last_time']
202
203 def getStartAnalysisTime(self):
204 return self.meta_infos['start_analysis_time']
205
206 def isValidForCurrentAnalysis(self, request):
207 cur_time = self.meta_infos['start_analysis_time']
208 # Analyse not started
209 if not cur_time: return False
210 return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
211
212 def hasBeenViewed(self, request):
213 return int(request['status']) in conf.viewed_http_codes
214
215 def getCurDisplayPath(self, filename):
216 cur_time = self.meta_infos['last_time']
217 return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
218
219 def getResourcesPath(self):
220 return conf.resources_path
221
222 def getCSSPath(self):
223 return conf.css_path
224
225 def _clearMeta(self):
226 self.meta_infos = {
227 'last_time' : None,
228 'start_analysis_time' : None
229 }
230 return self.meta_infos
231
232 def _clearDisplay(self):
233 self.display.clear()
234return self.display
235
236 def getDBFilename(self, time):
237 return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
238
239 def _openDB(self, filename, prot='r'):
240 if self.args.dont_compress:
241 return open(filename, prot)
242 else:
243 return gzip.open(filename, prot)
244
245 def _serialize(self, obj, filename):
246 base = os.path.dirname(filename)
247 if not os.path.exists(base):
248 os.makedirs(base)
249
250 with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
251 pickle.dump(obj, f)
252 os.fsync(f)
253 f.seek(0)
254 fzip.write(f.read())
255 os.remove(filename + '.tmp')
256
257 def _deserialize(self, filename):
258 if not os.path.exists(filename):
259 return None
260
261 with self._openDB(filename) as f:
262 return pickle.load(f)
263 return None
264
265 def _callPlugins(self, target_root, *args):
266 self.logger.info('==> Call plugins (%s)' % (target_root))
267 for (root, plugins) in self.plugins:
268 if root != target_root: continue
269 for p in plugins:
270 mod = self.cache_plugins.get(root + '.' + p, None)
271 if mod:
272 self.logger.info('\t%s' % (p))
273 mod.hook(*args)
274
275 def isPage(self, request):
276 self.logger.debug("Is page %s" % (request))
277 for e in conf.pages_extensions:
278 if request.endswith(e):
279 self.logger.debug("True")
280 return True
281 self.logger.debug("False")
282 return False
283
284 def isMultimediaFile(self, request):
285 self.logger.debug("Is multimedia %s" % (request))
286 for e in conf.multimedia_files:
287 if request.endswith(e):
288 self.logger.debug("True")
289 return True
290 self.logger.debug("False")
291 return False
292
293 def isValidVisitor(self, hit):
294 if hit['robot']: return False
295 if not (conf.count_hit_only_visitors or\
296 hit['viewed_pages']):
297 return False
298 return True
299
300 def isRobot(self, hit):
301 return hit['robot']
302
303 def _appendHit(self, hit):
304 remote_addr = hit['remote_addr']
305
306 if not remote_addr: return
307
308 if not remote_addr in self.current_analysis['visits'].keys():
309 self._createVisitor(hit)
310
311 super_hit = self.current_analysis['visits'][remote_addr]
312 super_hit['requests'].append(hit)
313 super_hit['bandwidth'] += int(hit['body_bytes_sent'])
314 super_hit['last_access'] = self.meta_infos['last_time']
315
316 request = hit['extract_request']
317
318 uri = request.get('extract_uri', request['http_uri'])
319
320 hit['is_page'] = self.isPage(uri)
321
322 if super_hit['robot'] or\
323 not self.hasBeenViewed(hit):
324 page_key = 'not_viewed_pages'
325 hit_key = 'not_viewed_hits'
326 else:
327 page_key = 'viewed_pages'
328 hit_key = 'viewed_hits'
329
330 if hit['is_page']:
331 super_hit[page_key] += 1
332 else:
333 super_hit[hit_key] += 1
334
335 def _createVisitor(self, hit):
336 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
337 super_hit['remote_addr'] = hit['remote_addr']
338 super_hit['remote_ip'] = hit['remote_addr']
339 super_hit['viewed_pages'] = 0
340 super_hit['viewed_hits'] = 0
341 super_hit['not_viewed_pages'] = 0
342 super_hit['not_viewed_hits'] = 0
343 super_hit['bandwidth'] = 0
344 super_hit['last_access'] = self.meta_infos['last_time']
345 super_hit['requests'] = []
346 super_hit['robot'] = False
347 super_hit['hit_only'] = 0
348
349 def _normalizeURI(self, uri):
350 if uri == '/': return uri
351 uri = self.final_slashes_re.sub('/', uri)
352 return uri
353
354 def _removeFinalSlashes(self, uri):
355 if uri == '/': return uri
356 return self.final_slashes_re.sub('', uri)
357
358 def _normalizeParameters(self, parameters):
359 # No parameters
360 if parameters == '?': return None
361 return parameters
362
363 def _decodeHTTPRequest(self, hit):
364 if not 'request' in hit.keys(): return False
365
366 groups = self.http_request_extracted.match(hit['request'])
367
368 if groups:
369 hit['extract_request'] = groups.groupdict()
370 uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
371 if uri_groups:
372 d = uri_groups.groupdict()
373 hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
374 if 'extract_parameters' in d.keys():
375 parameters = self._normalizeParameters(d['extract_parameters'])
376 if parameters:
377 hit['extract_request']['extract_parameters'] = parameters
378 else:
379 self.logger.warning("Bad request extraction %s" % (hit['request']))
380 return False
381
382 if hit['http_referer']:
383 referer_groups = self.uri_re.match(hit['http_referer'])
384 if referer_groups:
385 hit['extract_referer'] = referer_groups.groupdict()
386 hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
387 hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
388 return True
389
390 def _decodeTime(self, hit):
391 try:
392 hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
393 except ValueError, e:
394 if sys.version_info < (3, 2):
395 # Try without UTC value at the end (%z not recognized)
396 gmt_offset_str = hit['time_local'][-5:]
397 gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
398 gmt_offset_minutes = int(gmt_offset_str[3:5])*60
399 gmt_offset = gmt_offset_hours + gmt_offset_minutes
400 hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
401 # if gmt_offset_str[0] == '-':
402 # hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
403 # else:
404 # hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
405 else:
406 raise e
407 return hit['time_decoded']
408
409 def getDisplayIndex(self):
410 cur_time = self.meta_infos['last_time']
411 filename = self.getCurDisplayPath('index.html')
412
413 return self.display.getPage(filename)
414
415 def _generateDisplayDaysStats(self):
416 cur_time = self.meta_infos['last_time']
417 title = createCurTitle(self, self._('Statistics'))
418 filename = self.getCurDisplayPath('index.html')
419 self.logger.info('==> Generate display (%s)' % (filename))
420 page = self.display.createPage(title, filename, conf.css_path)
421 link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
422 page.appendBlock(link)
423
424 _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
425 days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
426 days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
427 nb_visits = 0
428 nb_days = 0
429 for i in range(1, nb_month_days+1):
430 day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
431 full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
432 if i in self.current_analysis['days_stats'].keys():
433 stats = self.current_analysis['days_stats'][i]
434 row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
435 stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
436 nb_visits += stats['nb_visits']
437 nb_days += 1
438 else:
439 row = [full_day, 0, 0, 0, 0, 0]
440 days.appendRow(row)
441 days.setCellValue(i-1, 4, bytesToStr(row[4]))
442 days.setCellValue(i-1, 5, bytesToStr(row[5]))
443 days.appendShortTitle(day)
444 adate = date(cur_time.tm_year, cur_time.tm_mon, i)
445 week_day = adate.weekday()
446 if week_day == 5 or week_day == 6:
447 days.setRowCSSClass(i-1, 'iwla_weekend')
448 if adate == date.today():
449 css = days.getCellCSSClass(i-1, 0)
450 if css: css = '%s %s' % (css, 'iwla_curday')
451 else: css = 'iwla_curday'
452 days.setCellCSSClass(i-1, 0, css)
453
454 stats = self.current_analysis['month_stats']
455
456 row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
457 if nb_days:
458 average_row = map(lambda(v): int(v/nb_days), row)
459 else:
460 average_row = map(lambda(v): 0, row)
461
462 average_row[0] = self._('Average')
463 average_row[4] = bytesToStr(average_row[4])
464 average_row[5] = bytesToStr(average_row[5])
465 days.appendRow(average_row)
466
467 row[0] = self._('Total')
468 row[4] = bytesToStr(row[4])
469 row[5] = bytesToStr(row[5])
470 days.appendRow(row)
471 page.appendBlock(days)
472 self.display.addPage(page)
473
474 def _generateDisplayMonthStats(self, page, year, month_stats):
475 cur_time = time.localtime()
476 months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
477 title = '%s %d' % (self._('Summary'), year)
478 cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
479 graph_cols=range(1,7)
480 months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
481 months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
482 months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
483 months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
484 total = [0] * len(cols)
485 for i in range(1, 13):
486 month = '%s<br/>%d' % (months_name[i], year)
487 full_month = '%s %d' % (months_name[i], year)
488 if i in month_stats.keys():
489 stats = month_stats[i]
490 link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
491 row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
492 stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
493 for j in graph_cols:
494 total[j] += row[j]
495 else:
496 row = [full_month, 0, 0, 0, 0, 0, 0, '']
497 months.appendRow(row)
498 months.setCellValue(i-1, 5, bytesToStr(row[5]))
499 months.setCellValue(i-1, 6, bytesToStr(row[6]))
500 months.appendShortTitle(month)
501 months_.appendRow(row[:-1])
502 months_.setCellValue(i-1, 5, bytesToStr(row[5]))
503 months_.setCellValue(i-1, 6, bytesToStr(row[6]))
504 months_.appendShortTitle(month)
505 if year == cur_time.tm_year and i == cur_time.tm_mon:
506 css = months.getCellCSSClass(i-1, 0)
507 if css: css = '%s %s' % (css, 'iwla_curday')
508 else: css = 'iwla_curday'
509 months.setCellCSSClass(i-1, 0, css)
510 months_.setCellCSSClass(i-1, 0, css)
511
512 total[0] = self._('Total')
513 total[5] = bytesToStr(total[5])
514 total[6] = bytesToStr(total[6])
515 total[7] = u''
516 months.appendRow(total)
517 page.appendBlock(months)
518
519 months_.appendRow(total[:-1])
520 filename = '%d/_stats.html' % (year)
521 page_ = self.display.createPage(u'', filename, conf.css_path)
522 page_.appendBlock(months_)
523 page_.build(conf.DISPLAY_ROOT, False)
524
525 def _generateDisplayWholeMonthStats(self):
526 title = '%s %s' % (self._('Statistics for'), conf.domain_name)
527 filename = 'index.html'
528
529 self.logger.info('==> Generate main page (%s)' % (filename))
530
531 page = self.display.createPage(title, filename, conf.css_path)
532
533 last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
534 page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
535 duration = datetime.now() - self.start_time
536 duration = time.gmtime(duration.seconds)
537 time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
538 if duration.tm_hour:
539 time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
540 time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
541 page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))
542
543 for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
544 self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])
545
546 self.display.addPage(page)
547
548 def _compressFile(self, build_time, root, filename):
549 path = os.path.join(root, filename)
550 gz_path = path + '.gz'
551
552 self.logger.debug('Compress %s => %s' % (path, gz_path))
553
554 if not os.path.exists(gz_path) or\
555 os.stat(path).st_mtime >= build_time:
556 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
557 f_out.write(f_in.read())
558
559 def _compressFiles(self, build_time, root):
560 if not conf.compress_output_files: return
561 for rootdir, subdirs, files in os.walk(root, followlinks=True):
562 for f in files:
563 for ext in conf.compress_output_files:
564 if f.endswith(ext):
565 self._compressFile(build_time, rootdir, f)
566 break
567
568 def _generateDisplay(self):
569 self._generateDisplayDaysStats()
570 self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
571 self._generateDisplayWholeMonthStats()
572 build_time = time.mktime(time.localtime())
573 self.display.build(conf.DISPLAY_ROOT)
574 self._compressFiles(build_time, conf.DISPLAY_ROOT)
575
576 def _createEmptyStats(self):
577 stats = {}
578 stats['viewed_bandwidth'] = 0
579 stats['not_viewed_bandwidth'] = 0
580 stats['viewed_pages'] = 0
581 stats['viewed_hits'] = 0
582 stats['nb_visits'] = 0
583
584 return stats
585
586 def _generateMonthStats(self):
587 self._clearDisplay()
588
589 visits = self.current_analysis['visits']
590
591 stats = self._createEmptyStats()
592 for (day, stat) in self.current_analysis['days_stats'].items():
593 for k in stats.keys():
594 stats[k] += stat[k]
595
596 duplicated_stats = {k:v for (k,v) in stats.items()}
597
598 cur_time = self.meta_infos['last_time']
599 self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
600 self.logger.info(stats)
601
602 if not 'month_stats' in self.current_analysis.keys():
603 self.current_analysis['month_stats'] = stats
604 else:
605 for (k,v) in stats.items():
606 self.current_analysis['month_stats'][k] = v
607
608 self.valid_visitors = {}
609 for (k,v) in visits.items():
610 if self.isValidVisitor(v):
611 self.valid_visitors[k] = v
612
613 duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
614
615 if args.display_only:
616 self._generateDisplay()
617 return
618
619 self._callPlugins(conf.POST_HOOK_DIRECTORY)
620
621 path = self.getDBFilename(cur_time)
622 if os.path.exists(path):
623 os.remove(path)
624
625 self.logger.info("==> Serialize to %s" % (path))
626 self._serialize(self.current_analysis, path)
627
628 # Save month stats
629 year = cur_time.tm_year
630 month = cur_time.tm_mon
631 if not 'stats' in self.meta_infos.keys():
632 self.meta_infos['stats'] = {}
633 if not year in self.meta_infos['stats'].keys():
634 self.meta_infos['stats'][year] = {}
635 self.meta_infos['stats'][year][month] = duplicated_stats
636
637 self.logger.info("==> Serialize to %s" % (conf.META_PATH))
638 self._serialize(self.meta_infos, conf.META_PATH)
639
640 self._generateDisplay()
641
642 def _generateDayStats(self):
643 if args.display_only:
644 return
645
646 visits = self.current_analysis['visits']
647 cur_time = self.meta_infos['last_time']
648
649 self._callPlugins(conf.PRE_HOOK_DIRECTORY)
650
651 stats = self._createEmptyStats()
652
653 for (k, super_hit) in visits.items():
654 if super_hit['last_access'].tm_mday != cur_time.tm_mday:
655 continue
656 viewed_pages = False
657 for hit in super_hit['requests'][::-1]:
658 if hit['time_decoded'].tm_mday != cur_time.tm_mday:
659 break
660 if super_hit['robot'] or\
661 not self.hasBeenViewed(hit):
662 stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
663 continue
664 stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
665 if hit['is_page']:
666 stats['viewed_pages'] += 1
667 viewed_pages = True
668 else:
669 stats['viewed_hits'] += 1
670 if (conf.count_hit_only_visitors or\
671 viewed_pages) and\
672 not super_hit['robot']:
673 stats['nb_visits'] += 1
674
675 self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
676 self.logger.info(stats)
677
678 self.current_analysis['days_stats'][cur_time.tm_mday] = stats
679
680 def _newHit(self, hit):
681 if not self.domain_name_re.match(hit['server_name']):
682 self.logger.debug("Not in domain %s" % (hit))
683 return False
684
685 t = self._decodeTime(hit)
686
687 cur_time = self.meta_infos['last_time']
688
689 if cur_time == None:
690 self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
691 self.analyse_started = True
692 else:
693 if not self.analyse_started and\
694 time.mktime(t) <= time.mktime(cur_time):
695 self.logger.debug("Not in time")
696 return False
697 self.analyse_started = True
698 if cur_time.tm_mon != t.tm_mon:
699 self._generateDayStats()
700 self._generateMonthStats()
701 self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
702 elif cur_time.tm_mday != t.tm_mday:
703 self._generateDayStats()
704
705 self.meta_infos['last_time'] = t
706
707 if not self.meta_infos['start_analysis_time']:
708 self.meta_infos['start_analysis_time'] = t
709
710 if not self._decodeHTTPRequest(hit): return False
711
712 if hit['extract_request']['http_method'] not in ['GET', 'POST']:
713 return False
714
715 for k in hit.keys():
716 if hit[k] == '-' or hit[k] == '*':
717 hit[k] = ''
718
719 self._appendHit(hit)
720
721 return True
722
723 def _reset(self):
724 reset_time = time.strptime(self.args.reset, '%m/%Y')
725
726 self.logger.info('Reset time')
727 self.logger.info(reset_time)
728
729 self.meta_infos['last_time'] = reset_time
730
731 cur_time = time.localtime()
732 year = reset_time.tm_year
733 while year < cur_time.tm_year:
734 db_path = os.path.join(conf.DB_ROOT, str(year))
735 if os.path.exists(db_path): shutil.rmtree(db_path)
736 output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
737 if os.path.exists(output_path): shutil.rmtree(output_path)
738 year += 1
739 month = reset_time.tm_mon
740 while month <= cur_time.tm_mon:
741 db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
742 if os.path.exists(db_path): shutil.rmtree(db_path)
743 output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
744 if os.path.exists(output_path): shutil.rmtree(output_path)
745 month += 1
746
747 def start(self, _file, args):
748 self.args = args
749 self.start_time = datetime.now()
750
751 self.logger.info('==> Load previous database')
752
753 self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
754 if self.meta_infos['last_time']:
755 if args.reset:
756 self._reset()
757 self.logger.info('Last time')
758 self.logger.info(self.meta_infos['last_time'])
759 self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
760 else:
761 self._clearVisits()
762
763 self.meta_infos['start_analysis_time'] = None
764
765 self.cache_plugins = preloadPlugins(self.plugins, self)
766
767 self.logger.info('==> Analysing log')
768
769 for l in _file:
770 # print "line " + l
771
772 groups = self.log_re.match(l)
773
774 if groups:
775 self._newHit(groups.groupdict())
776 else:
777 self.logger.warning("No match for %s" % (l))
778 #break
779
780 if self.analyse_started:
781 self._generateDayStats()
782 self._generateMonthStats()
783 del self.meta_infos['start_analysis_time']
784 else:
785 self.logger.info('==> Analyse not started : nothing new')
786
787
788class FileIter(object):
789 def __init__(self, filenames):
790 self.filenames = [f for f in filenames.split(',') if f]
791 for f in self.filenames:
792 if not os.path.exists(f):
793 print 'No such file \'%s\'' % (f)
794 sys.exit(-1)
795 self.cur_file = None
796 self._openNextFile()
797
798 def __iter__(self):
799 return self
800
801 def __next__(self):
802 return self.next()
803
804 def _openNextFile(self):
805 if self.cur_file:
806 self.cur_file.close()
807 self.cur_file = None
808 if not self.filenames:
809 raise StopIteration()
810 filename = self.filenames.pop(0)
811 if filename.endswith('gz'):
812 self.cur_file = gzip.open(filename, 'r')
813 else:
814 self.cur_file = open(filename)
815
816 def next(self):
817 l = self.cur_file.readline()
818 if not l:
819 self._openNextFile()
820 l = self.cur_file.readline()
821 return l[:-1]
822
823if __name__ == '__main__':
824 parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')
825
826 parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
827 default=False,
828 help='Clean output before starting')
829
830 parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
831 default=False,
832 help='Read data from stdin instead of conf.analyzed_filename')
833
834 parser.add_argument('-f', '--file', dest='file',
835 help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')
836
837 parser.add_argument('-d', '--log-level', dest='loglevel',
838 default='INFO', type=str,
839 help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
840
841 parser.add_argument('-r', '--reset', dest='reset',
842 default=False,
843 help='Reset analysis to a specific date (month/year)')
844
845 parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
846 default=False,
847 help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')
848
849 parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
850 default=False,
851 help='Only generate display')
852
853 args = parser.parse_args()
854
855 # Load user conf
856 for (k,v) in user_conf.__dict__.items():
857 if k.endswith('_append'):
858 new_k = k[:-7]
859 if new_k in dir(conf):
860 if type(conf.__dict__[new_k]) == list:
861 if type(v) == list:
862 conf.__dict__[new_k] += v
863 else:
864 conf.__dict__[new_k].append(v)
865 else:
866 print("Error %s is not a list" % (new_k))
867 else:
868 print("Error %s doesn't exists in default conf" % (new_k))
869 else:
870 conf.__dict__.update({k:v})
871
872 if args.clean_output:
873 if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
874 if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)
875
876 loglevel = getattr(logging, args.loglevel.upper(), None)
877 if not isinstance(loglevel, int):
878 raise ValueError('Invalid log level: %s' % (args.loglevel))
879
880 iwla = IWLA(loglevel)
881
882 required_conf = ['analyzed_filename', 'domain_name']
883 if not validConfRequirements(required_conf, iwla, 'Main Conf'):
884 sys.exit(0)
885
886 if args.stdin:
887 iwla.start(sys.stdin, args)
888 else:
889 filename = args.file or conf.analyzed_filename
890 iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags