iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
22import os
23import shutil
24import sys
25import re
26import time
27import pickle
28import gzip
29import importlib
30import argparse
31import logging
32import gettext
33from calendar import monthrange
34from datetime import date, datetime
35
36import default_conf as conf
37import conf as user_conf
38
39from iplugin import *
40from display import *
41
42"""
43Main class IWLA
44Parse Log, compute them, call plugins and produce output
45For now, only HTTP log are valid
46
47Plugin requirements :
48 None
49
50Conf values needed :
51 analyzed_filename
52 domain_name
53 locales_path
54 compress_output_files*
55
56Output files :
57 DB_ROOT/meta.db
58 DB_ROOT/year/month/iwla.db
59 OUTPUT_ROOT/index.html
60 OUTPUT_ROOT/year/_stats.html
61 OUTPUT_ROOT/year/month/index.html
62
63Statistics creation :
64
65meta :
66 last_time
67 start_analysis_time
68 stats =>
69 year =>
70 month =>
71 viewed_bandwidth
72 not_viewed_bandwidth
73 viewed_pages
74 viewed_hits
75 nb_visits
76 nb_visitors
77
78month_stats :
79 viewed_bandwidth
80 not_viewed_bandwidth
81 viewed_pages
82 viewed_hits
83 nb_visits
84
85days_stats :
86 day =>
87 viewed_bandwidth
88 not_viewed_bandwidth
89 viewed_pages
90 viewed_hits
91 nb_visits
92 nb_visitors
93
94visits :
95 remote_addr =>
96 remote_addr
97 remote_ip
98 viewed_pages
99 viewed_hits
100 not_viewed_pages
101 not_viewed_hits
102 bandwidth
103 last_access
104 requests =>
105 [fields_from_format_log]
106 extract_request =>
107 http_method
108 http_uri
109 http_version
110 extract_uri
111 extract_parameters*
112 extract_referer* =>
113 extract_uri
114 extract_parameters*
115 robot
116 hit_only
117 is_page
118
119valid_visitors:
120 month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
121
122Statistics update :
123 None
124
125Statistics deletion :
126 None
127"""
128
129
class IWLA(object):
    """Main IWLA analyzer.

    Parses HTTP access logs, computes per-day/per-month statistics,
    calls pre/post/display plugins and produces HTML output.
    See the module docstring for the full data layout.
    """

    ANALYSIS_CLASS = 'HTTP'   # only HTTP logs are supported
    API_VERSION = 1           # plugin API version
    IWLA_VERSION = '0.4-dev'

    def __init__(self, logLevel):
        # Persistent meta information: last_time, start_analysis_time, stats
        self.meta_infos = {}
        self.analyse_started = False
        # Current month state: days_stats / month_stats / visits
        self.current_analysis = {}
        self.start_time = 0
        # Loaded plugin modules, keyed by '<hook_dir>.<plugin_name>'
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Build a log-line regex from conf.log_format: escape every
        # non-word/non-$ character, then turn each $field into a named group.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        # Splits a URI into path, optional ?query and optional #fragment
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.final_slashes_re = re.compile(r'/+$')
        # (hook directory, configured hook list) pairs scanned by _callPlugins()
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info('==> Start')
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            # Fall back to untranslated strings when the catalog is missing
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        # NOTE(review): ugettext is Python 2 only (would be t.gettext on Python 3)
        self._ = t.ugettext
166
167 def getVersion(self):
168 return IWLA.IWLA_VERSION
169
170 def getConfValue(self, key, default=None):
171 if not key in dir(conf):
172 return default
173 else:
174 return conf.__dict__[key]
175
176 def _clearVisits(self):
177 self.current_analysis = {
178 'days_stats' : {},
179 'month_stats' : {},
180 'visits' : {}
181 }
182 self.valid_visitors = None
183 return self.current_analysis
184
    def getDaysStats(self):
        """Return the per-day statistics dict of the current month."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Return the aggregated statistics of the current month."""
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        """Return the visits dict, keyed by remote address."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Return visitors kept by isValidVisitor(), or None before month stats ran."""
        return self.valid_visitors

    def getDisplay(self):
        """Return the DisplayHTMLBuild instance used for output."""
        return self.display

    def getCurTime(self):
        """Return the timestamp (struct_time) of the last processed hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """Return the timestamp at which the current analysis run started."""
        return self.meta_infos['start_analysis_time']
205
    def isValidForCurrentAnalysis(self, request):
        """Return True when `request` is strictly newer than the analysis start time."""
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
211
212 def hasBeenViewed(self, request):
213 return int(request['status']) in conf.viewed_http_codes
214
215 def getCurDisplayPath(self, filename):
216 cur_time = self.meta_infos['last_time']
217 return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
218
    def getResourcesPath(self):
        """Return the configured path to static resources."""
        return conf.resources_path

    def getCSSPath(self):
        """Return the configured path to CSS files."""
        return conf.css_path
224
225 def _clearMeta(self):
226 self.meta_infos = {
227 'last_time' : None,
228 'start_analysis_time' : None
229 }
230 return self.meta_infos
231
232 def _clearDisplay(self):
233 self.display.clear()
234return self.display
235
236 def getDBFilename(self, time):
237 return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
238
    def _openDB(self, filename, prot='r'):
        """Open a database file: plain when --dont-compress is set, gzip otherwise."""
        if self.args.dont_compress:
            return open(filename, prot)
        else:
            return gzip.open(filename, prot)
244
245 def _serialize(self, obj, filename):
246 base = os.path.dirname(filename)
247 if not os.path.exists(base):
248 os.makedirs(base)
249
250 with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
251 pickle.dump(obj, f)
252 f.seek(0)
253 fzip.write(f.read())
254 os.remove(filename + '.tmp')
255
256 def _deserialize(self, filename):
257 if not os.path.exists(filename):
258 return None
259
260 with self._openDB(filename) as f:
261 return pickle.load(f)
262 return None
263
264 def _callPlugins(self, target_root, *args):
265 self.logger.info('==> Call plugins (%s)' % (target_root))
266 for (root, plugins) in self.plugins:
267 if root != target_root: continue
268 for p in plugins:
269 mod = self.cache_plugins.get(root + '.' + p, None)
270 if mod:
271 self.logger.info('\t%s' % (p))
272 mod.hook(*args)
273
274 def isPage(self, request):
275 self.logger.debug("Is page %s" % (request))
276 for e in conf.pages_extensions:
277 if request.endswith(e):
278 self.logger.debug("True")
279 return True
280 self.logger.debug("False")
281 return False
282
283 def isMultimediaFile(self, request):
284 self.logger.debug("Is multimedia %s" % (request))
285 for e in conf.multimedia_files:
286 if request.endswith(e):
287 self.logger.debug("True")
288 return True
289 self.logger.debug("False")
290 return False
291
292 def isValidVisitor(self, hit):
293 if hit['robot']: return False
294 if not (conf.count_hit_only_visitors or\
295 hit['viewed_pages']):
296 return False
297 return True
298
    def _appendHit(self, hit):
        """Attach a decoded hit to its visitor, updating counters.

        Creates the visitor entry on first sight, accumulates
        bandwidth, refreshes last_access, and bumps the proper
        viewed/not-viewed page/hit counter.
        """
        remote_addr = hit['remote_addr']

        # Hits without a remote address cannot be attributed to anyone
        if not remote_addr: return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        # Prefer the normalized URI; fall back to the raw one
        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        # Robots and non-2xx/3xx responses count as "not viewed"
        if super_hit['robot'] or\
           not self.hasBeenViewed(hit):
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1
330
331 def _createVisitor(self, hit):
332 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
333 super_hit['remote_addr'] = hit['remote_addr']
334 super_hit['remote_ip'] = hit['remote_addr']
335 super_hit['viewed_pages'] = 0
336 super_hit['viewed_hits'] = 0
337 super_hit['not_viewed_pages'] = 0
338 super_hit['not_viewed_hits'] = 0
339 super_hit['bandwidth'] = 0
340 super_hit['last_access'] = self.meta_infos['last_time']
341 super_hit['requests'] = []
342 super_hit['robot'] = False
343 super_hit['hit_only'] = 0
344
    def _normalizeURI(self, uri):
        """Collapse repeated trailing slashes into a single one ('/a//' -> '/a/')."""
        if uri == '/': return uri
        uri = self.final_slashes_re.sub('/', uri)
        return uri

    def _removeFinalSlashes(self, uri):
        """Strip all trailing slashes ('/a//' -> '/a'); the bare root '/' is kept."""
        if uri == '/': return uri
        return self.final_slashes_re.sub('', uri)

    def _normalizeParameters(self, parameters):
        """Return the query parameters, or None when they are the empty marker '?'."""
        # No parameters
        if parameters == '?': return None
        return parameters
358
    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] into method/URI/version and normalize the URI.

        Fills hit['extract_request'] (and 'extract_referer' when a
        referer is present). Returns False when the request line is
        missing or cannot be parsed.
        """
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            # Split the URI into path / query / fragment parts
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
                if 'extract_parameters' in d.keys():
                    parameters = self._normalizeParameters(d['extract_parameters'])
                    if parameters:
                        hit['extract_request']['extract_parameters'] = parameters
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
                # Referers are normalized more aggressively: all final slashes removed
                hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
                hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
        return True
385
    def _decodeTime(self, hit):
        """Parse hit['time_local'] with conf.time_format into a struct_time.

        Stores the result in hit['time_decoded'] and returns it.
        On Python < 3.2 a fallback retries without the trailing UTC
        offset (%z is not recognized there); otherwise the ValueError
        is re-raised.
        """
        try:
            hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        except ValueError, e:
            if sys.version_info < (3, 2):
                # Try without UTC value at the end (%z not recognized)
                gmt_offset_str = hit['time_local'][-5:]
                gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
                gmt_offset_minutes = int(gmt_offset_str[3:5])*60
                # NOTE(review): gmt_offset is currently unused — the
                # adjustment below is commented out, so times stay naive.
                gmt_offset = gmt_offset_hours + gmt_offset_minutes
                hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
                # if gmt_offset_str[0] == '-':
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
                # else:
                #     hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
            else:
                raise e
        return hit['time_decoded']
404
405 def getDisplayIndex(self):
406 cur_time = self.meta_infos['last_time']
407 filename = self.getCurDisplayPath('index.html')
408
409 return self.display.getPage(filename)
410
    def _generateDisplayDaysStats(self):
        """Build the per-day statistics table page for the current month.

        One row per calendar day (zeros for days with no data), plus an
        'Average' and a 'Total' row; weekends and the current day get a
        dedicated CSS class.
        """
        cur_time = self.meta_infos['last_time']
        title = createCurTitle(self, self._('Statistics'))
        filename = self.getCurDisplayPath('index.html')
        self.logger.info('==> Generate display (%s)' % (filename))
        page = self.display.createPage(title, filename, conf.css_path)
        # Embed the yearly summary above the daily table
        link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
        page.appendBlock(link)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(1, nb_month_days+1):
            day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
            full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visits']
                nb_days += 1
            else:
                # No traffic recorded for this day
                row = [full_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            # Bandwidth columns are re-rendered in human readable form
            days.setCellValue(i-1, 4, bytesToStr(row[4]))
            days.setCellValue(i-1, 5, bytesToStr(row[5]))
            days.appendShortTitle(day)
            adate = date(cur_time.tm_year, cur_time.tm_mon, i)
            week_day = adate.weekday()
            # weekday() 5/6 == Saturday/Sunday
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i-1, 'iwla_weekend')
            if adate == date.today():
                css = days.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                days.setCellCSSClass(i-1, 0, css)

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        # NOTE(review): lambda(v) and indexable map() results are
        # Python 2 only — this block would need rewriting for Python 3.
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = self._('Average')
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = self._('Total')
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)
469
    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append a 12-month summary table for `year` to `page`.

        Builds two variants: `months` (with a Details link column) for
        the main page, and `months_` (without it) written to
        `year/_stats.html` for the per-month iframe.
        """
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                # Month with no recorded statistics
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            # Bandwidth columns rendered human readable
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            # Highlight the current month
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        # Standalone variant served inside the per-month iframe
        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        page_.build(conf.DISPLAY_ROOT, False)
520
    def _generateDisplayWholeMonthStats(self):
        """Build the top-level index.html: last-update line, analysis
        duration, and one yearly summary table per recorded year."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # NOTE(review): '%02d' is not a standard strftime directive —
        # platform-dependent; confirm intended output ('%d' padded?).
        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        # Elapsed wall-clock time since start(), rendered as h/m/s
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        # Most recent year first
        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)
543
544 def _compressFile(self, build_time, root, filename):
545 path = os.path.join(root, filename)
546 gz_path = path + '.gz'
547
548 self.logger.debug('Compress %s => %s' % (path, gz_path))
549
550 if not os.path.exists(gz_path) or\
551 os.stat(path).st_mtime > build_time:
552 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
553 f_out.write(f_in.read())
554
555 def _compressFiles(self, build_time, root):
556 if not conf.compress_output_files: return
557 for rootdir, subdirs, files in os.walk(root, followlinks=True):
558 for f in files:
559 for ext in conf.compress_output_files:
560 if f.endswith(ext):
561 self._compressFile(build_time, rootdir, f)
562 break
563
    def _generateDisplay(self):
        """Generate all HTML output: day stats, display plugins, main
        page; then write files and gzip them for the web server."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        # build_time taken before build() so freshly written files are compressed
        build_time = time.localtime()
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)
571
572 def _createEmptyStats(self):
573 stats = {}
574 stats['viewed_bandwidth'] = 0
575 stats['not_viewed_bandwidth'] = 0
576 stats['viewed_pages'] = 0
577 stats['viewed_hits'] = 0
578 stats['nb_visits'] = 0
579
580 return stats
581
    def _generateMonthStats(self):
        """Aggregate day stats into month stats, persist DBs and render output.

        Sums all days_stats into month_stats, computes valid visitors,
        runs POST hooks, serializes the month DB and meta DB, then
        triggers HTML generation.
        """
        self._clearDisplay()

        visits = self.current_analysis['visits']

        # Sum every per-day counter into one month-wide stats dict
        stats = self._createEmptyStats()
        for (day, stat) in self.current_analysis['days_stats'].items():
            for k in stats.keys():
                stats[k] += stat[k]

        # Shallow copy kept for meta_infos (separate from month_stats)
        duplicated_stats = {k:v for (k,v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
        self.logger.info(stats)

        if not 'month_stats' in self.current_analysis.keys():
            self.current_analysis['month_stats'] = stats
        else:
            for (k,v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        # Filter robots / hit-only visitors according to configuration
        self.valid_visitors = {}
        for (k,v) in visits.items():
            if self.isValidVisitor(v):
                self.valid_visitors[k] = v

        duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        # Rewrite the month database from scratch
        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        self.logger.info("==> Serialize to %s" % (path))
        self._serialize(self.current_analysis, path)

        # Save month stats
        year = cur_time.tm_year
        month = cur_time.tm_mon
        if not 'stats' in self.meta_infos.keys():
            self.meta_infos['stats'] = {}
        if not year in self.meta_infos['stats'].keys():
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

        self.logger.info("==> Serialize to %s" % (conf.META_PATH))
        self._serialize(self.meta_infos, conf.META_PATH)

        self._generateDisplay()
633
    def _generateDayStats(self):
        """Compute statistics for the current day and store them in days_stats.

        Runs PRE hooks first (e.g. robot detection), then walks every
        visitor's requests of the day, accumulating bandwidth and
        page/hit counters, and counts the day's visits.
        """
        visits = self.current_analysis['visits']
        cur_time = self.meta_infos['last_time']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._createEmptyStats()

        for (k, super_hit) in visits.items():
            # Skip visitors not seen today
            if super_hit['last_access'].tm_mday != cur_time.tm_mday:
                continue
            viewed_pages = False
            # Walk requests newest-first; stop at the first one from another day
            for hit in super_hit['requests'][::-1]:
                if hit['time_decoded'].tm_mday != cur_time.tm_mday:
                    break
                if super_hit['robot'] or\
                   not self.hasBeenViewed(hit):
                    stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
                    continue
                stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
                if hit['is_page']:
                    stats['viewed_pages'] += 1
                    viewed_pages = True
                else:
                    stats['viewed_hits'] += 1
            # A visit counts when not a robot and (page viewed, or hit-only allowed)
            if (conf.count_hit_only_visitors or\
                viewed_pages) and\
               not super_hit['robot']:
                stats['nb_visits'] += 1

        self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
        self.logger.info(stats)

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats
668
    def _newHit(self, hit):
        """Process one decoded log line; return True when it was kept.

        Filters by domain and time, rolls over day/month statistics
        when the date changes, decodes the HTTP request and appends the
        hit to its visitor.
        """
        # Ignore hits for other virtual hosts
        if not self.domain_name_re.match(hit['server_name']):
            self.logger.debug("Not in domain %s" % (hit))
            return False

        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            # First hit ever: load (or create) the month DB for this date
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            # Skip lines already analysed in a previous run
            if not self.analyse_started and\
               time.mktime(t) <= time.mktime(cur_time):
                self.logger.debug("Not in time")
                return False
            self.analyse_started = True
            # Month rollover: close current day + month, load next month DB
            if cur_time.tm_mon != t.tm_mon:
                self._generateDayStats()
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                # Day rollover: close current day only
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        if hit['extract_request']['http_method'] not in ['GET', 'POST']:
            return False

        # Normalize the log's "empty field" markers to empty strings
        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True
711
    def _reset(self):
        """Reset analysis back to the month/year given by --reset.

        Removes DB and output directories for every whole year between
        the reset date and now, then every month of the current year
        from the reset month onward, and rewinds meta last_time.
        """
        reset_time = time.strptime(self.args.reset, '%m/%Y')

        self.logger.info('Reset time')
        self.logger.info(reset_time)

        self.meta_infos['last_time'] = reset_time

        cur_time = time.localtime()
        year = reset_time.tm_year
        # Drop whole past years first
        while year < cur_time.tm_year:
            db_path = os.path.join(conf.DB_ROOT, str(year))
            if os.path.exists(db_path): shutil.rmtree(db_path)
            output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
            if os.path.exists(output_path): shutil.rmtree(output_path)
            year += 1
        # Then drop the remaining months of the current year
        month = reset_time.tm_mon
        while month <= cur_time.tm_mon:
            db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
            if os.path.exists(db_path): shutil.rmtree(db_path)
            output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
            if os.path.exists(output_path): shutil.rmtree(output_path)
            month += 1
735
    def start(self, _file, args):
        """Run the analysis over `_file` (an iterable of log lines).

        Loads the previous meta/month databases (honoring --reset),
        preloads plugins, matches each line against the log regex and
        feeds matches to _newHit(); finally closes pending day/month
        statistics when anything new was analysed.
        """
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict())
            else:
                self.logger.warning("No match for %s" % (l))
                #break

        if self.analyse_started:
            # Close the last day/month and generate output
            self._generateDayStats()
            self._generateMonthStats()
            # start_analysis_time is transient; not persisted in meta DB
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')
775
776
class FileIter(object):
    """Iterator yielding lines (without trailing newline) from one or
    more comma-separated log files; '.gz' files are opened with gzip."""

    def __init__(self, filenames):
        # Comma-separated list; empty entries are skipped
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                print 'No such file \'%s\'' % (f)
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol entry, delegates to next()
        return self.next()

    def _openNextFile(self):
        # Close the current file and open the next; raising StopIteration
        # here ends the overall iteration when the list is exhausted.
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        # Return the next line; on EOF switch to the next file first
        l = self.cur_file.readline()
        if not l:
            self._openNextFile()
            l = self.cur_file.readline()
        return l[:-1]
811
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    args = parser.parse_args()

    # Load user conf
    # Keys ending in '_append' extend the matching default-conf list
    # instead of replacing it; everything else overrides the default.
    for (k,v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if type(conf.__dict__[new_k]) == list:
                    if type(v) == list:
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

    # Optionally wipe all previous databases and generated output
    if args.clean_output:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    # Map the textual log level to the logging module's integer constant
    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags