iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright Grégory Soutadé 2015
5
6# This file is part of iwla
7
8# iwla is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# iwla is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with iwla. If not, see <http://www.gnu.org/licenses/>.
20#
21
22import os
23import shutil
24import sys
25import re
26import time
27import pickle
28import gzip
29import importlib
30import argparse
31import logging
32import gettext
33from calendar import monthrange
34from datetime import date, datetime
35
36import default_conf as conf
37import conf as user_conf
38
39from iplugin import *
40from display import *
41
42"""
43Main class IWLA
44Parse Log, compute them, call plugins and produce output
For now, only HTTP logs are valid
46
47Plugin requirements :
48 None
49
50Conf values needed :
51 analyzed_filename
52 domain_name
53 locales_path
54 compress_output_files*
55
56Output files :
57 DB_ROOT/meta.db
58 DB_ROOT/year/month/iwla.db
59 OUTPUT_ROOT/index.html
60 OUTPUT_ROOT/year/_stats.html
61 OUTPUT_ROOT/year/month/index.html
62
63Statistics creation :
64
65meta :
66 last_time
67 start_analysis_time
68 stats =>
69 year =>
70 month =>
71 viewed_bandwidth
72 not_viewed_bandwidth
73 viewed_pages
74 viewed_hits
75 nb_visits
76 nb_visitors
77
78month_stats :
79 viewed_bandwidth
80 not_viewed_bandwidth
81 viewed_pages
82 viewed_hits
83 nb_visits
84
85days_stats :
86 day =>
87 viewed_bandwidth
88 not_viewed_bandwidth
89 viewed_pages
90 viewed_hits
91 nb_visits
92 nb_visitors
93
94visits :
95 remote_addr =>
96 remote_addr
97 remote_ip
98 viewed_pages
99 viewed_hits
100 not_viewed_pages
101 not_viewed_hits
102 bandwidth
103 last_access
104 requests =>
105 [fields_from_format_log]
106 extract_request =>
107 http_method
108 http_uri
109 http_version
110 extract_uri
111 extract_parameters*
112 extract_referer* =>
113 extract_uri
114 extract_parameters*
115 robot
116 hit_only
117 is_page
118
119valid_visitors:
120 month_stats without robot and hit only visitors (if not conf.count_hit_only_visitors)
121
122Statistics update :
123 None
124
125Statistics deletion :
126 None
127"""
128
129
class IWLA(object):
    """Main analyzer: parses HTTP logs, computes statistics, calls plugins
    and produces HTML output."""

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1
    IWLA_VERSION = '0.4-dev'

    def __init__(self, logLevel):
        """Initialize analyzer state, compile log/URI regexps from conf and
        set up logging and gettext translations.

        logLevel -- logging module level (e.g. logging.INFO)
        """
        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.start_time = 0
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        # Build a regexp from conf.log_format: escape every non-word
        # character, then turn each $field into a named capture group.
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        # Raw string: '\g' is not a valid escape in a plain literal
        self.log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?#]+)(\?(?P<extract_parameters>[^#]+))?(#.*)?')
        self.domain_name_re = re.compile(r'.*%s' % conf.domain_name)
        self.final_slashes_re = re.compile(r'/+$')
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

        logging.basicConfig(format='%(name)s %(message)s', level=logLevel)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info('==> Start')
        try:
            t = gettext.translation('iwla', localedir=conf.locales_path, languages=[conf.locale], codeset='utf8')
            self.logger.info('\tUsing locale %s' % (conf.locale))
        except IOError:
            t = gettext.NullTranslations()
            self.logger.info('\tUsing default locale en_EN')
        # Bug fix: ugettext() only exists on Python 2 translation objects;
        # Python 3 (which _decodeTime explicitly checks for) only has
        # gettext(), which already returns unicode.
        self._ = getattr(t, 'ugettext', t.gettext)
166
    def getVersion(self):
        """Return the iwla release string."""
        return IWLA.IWLA_VERSION
169
170 def getConfValue(self, key, default=None):
171 if not key in dir(conf):
172 return default
173 else:
174 return conf.__dict__[key]
175
176 def _clearVisits(self):
177 self.current_analysis = {
178 'days_stats' : {},
179 'month_stats' : {},
180 'visits' : {}
181 }
182 self.valid_visitors = None
183 return self.current_analysis
184
    def getDaysStats(self):
        """Per-day statistics dict of the month being analyzed."""
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        """Aggregated statistics dict of the month being analyzed."""
        return self.current_analysis['month_stats']

    def getCurrentVisits(self):
        """Visitors dict of the current month, keyed by remote address."""
        return self.current_analysis['visits']

    def getValidVisitors(self):
        """Visitors kept after robot/hit-only filtering (None until computed)."""
        return self.valid_visitors

    def getDisplay(self):
        """The HTML display builder object."""
        return self.display

    def getCurTime(self):
        """struct_time of the last parsed hit."""
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        """struct_time at which the current analysis pass started."""
        return self.meta_infos['start_analysis_time']
205
206 def isValidForCurrentAnalysis(self, request):
207 cur_time = self.meta_infos['start_analysis_time']
208 # Analyse not started
209 if not cur_time: return False
210 return (time.mktime(cur_time) < time.mktime(request['time_decoded']))
211
212 def hasBeenViewed(self, request):
213 return int(request['status']) in conf.viewed_http_codes
214
215 def getCurDisplayPath(self, filename):
216 cur_time = self.meta_infos['last_time']
217 return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
218
219 def getResourcesPath(self):
220 return conf.resources_path
221
222 def getCSSPath(self):
223 return conf.css_path
224
225 def _clearMeta(self):
226 self.meta_infos = {
227 'last_time' : None,
228 'start_analysis_time' : None
229 }
230 return self.meta_infos
231
232 def _clearDisplay(self):
233 self.display.clear()
234return self.display
235
236 def getDBFilename(self, time):
237 return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
238
239 def _openDB(self, filename, prot='r'):
240 if self.args.dont_compress:
241 return open(filename, prot)
242 else:
243 return gzip.open(filename, prot)
244
245 def _serialize(self, obj, filename):
246 base = os.path.dirname(filename)
247 if not os.path.exists(base):
248 os.makedirs(base)
249
250 with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
251 pickle.dump(obj, f)
252 f.seek(0)
253 fzip.write(f.read())
254 os.remove(filename + '.tmp')
255
256 def _deserialize(self, filename):
257 if not os.path.exists(filename):
258 return None
259
260 with self._openDB(filename) as f:
261 return pickle.load(f)
262 return None
263
264 def _callPlugins(self, target_root, *args):
265 self.logger.info('==> Call plugins (%s)' % (target_root))
266 for (root, plugins) in self.plugins:
267 if root != target_root: continue
268 for p in plugins:
269 mod = self.cache_plugins.get(root + '.' + p, None)
270 if mod:
271 self.logger.info('\t%s' % (p))
272 mod.hook(*args)
273
274 def isPage(self, request):
275 self.logger.debug("Is page %s" % (request))
276 for e in conf.pages_extensions:
277 if request.endswith(e):
278 self.logger.debug("True")
279 return True
280 self.logger.debug("False")
281 return False
282
283 def isMultimediaFile(self, request):
284 self.logger.debug("Is multimedia %s" % (request))
285 for e in conf.multimedia_files:
286 if request.endswith(e):
287 self.logger.debug("True")
288 return True
289 self.logger.debug("False")
290 return False
291
292 def isValidVisitor(self, hit):
293 if hit['robot']: return False
294 if not (conf.count_hit_only_visitors or\
295 hit['viewed_pages']):
296 return False
297 return True
298
299 def _appendHit(self, hit):
300 remote_addr = hit['remote_addr']
301
302 if not remote_addr: return
303
304 if not remote_addr in self.current_analysis['visits'].keys():
305 self._createVisitor(hit)
306
307 super_hit = self.current_analysis['visits'][remote_addr]
308 super_hit['requests'].append(hit)
309 super_hit['bandwidth'] += int(hit['body_bytes_sent'])
310 super_hit['last_access'] = self.meta_infos['last_time']
311
312 request = hit['extract_request']
313
314 uri = request.get('extract_uri', request['http_uri'])
315
316 hit['is_page'] = self.isPage(uri)
317
318 if super_hit['robot'] or\
319 not self.hasBeenViewed(hit):
320 page_key = 'not_viewed_pages'
321 hit_key = 'not_viewed_hits'
322 else:
323 page_key = 'viewed_pages'
324 hit_key = 'viewed_hits'
325
326 if hit['is_page']:
327 super_hit[page_key] += 1
328 else:
329 super_hit[hit_key] += 1
330
331 def _createVisitor(self, hit):
332 super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
333 super_hit['remote_addr'] = hit['remote_addr']
334 super_hit['remote_ip'] = hit['remote_addr']
335 super_hit['viewed_pages'] = 0
336 super_hit['viewed_hits'] = 0
337 super_hit['not_viewed_pages'] = 0
338 super_hit['not_viewed_hits'] = 0
339 super_hit['bandwidth'] = 0
340 super_hit['last_access'] = self.meta_infos['last_time']
341 super_hit['requests'] = []
342 super_hit['robot'] = False
343 super_hit['hit_only'] = 0
344
345 def _normalizeURI(self, uri):
346 if uri == '/': return uri
347 uri = self.final_slashes_re.sub('/', uri)
348 return uri
349
350 def _removeFinalSlashes(self, uri):
351 if uri == '/': return uri
352 return self.final_slashes_re.sub('', uri)
353
354 def _normalizeParameters(self, parameters):
355 # No parameters
356 if parameters == '?': return None
357 return parameters
358
    def _decodeHTTPRequest(self, hit):
        """Split hit['request'] into method/URI/version and normalize URI parts.

        Fills hit['extract_request'] (http_method, http_uri, http_version,
        extract_uri, optional extract_parameters) and, when a referer is
        present, hit['extract_referer'].

        Returns True on success, False when the request line is absent or
        cannot be parsed.
        """
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = self._normalizeURI(d['extract_uri'])
                # extract_parameters only kept when non-empty after normalization
                if 'extract_parameters' in d.keys():
                    parameters = self._normalizeParameters(d['extract_parameters'])
                    if parameters:
                        hit['extract_request']['extract_parameters'] = parameters
        else:
            self.logger.warning("Bad request extraction %s" % (hit['request']))
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
                # Referer URIs are normalized harder: final slashes removed
                hit['extract_referer']['extract_uri'] = self._removeFinalSlashes(hit['extract_referer']['extract_uri'])
                hit['extract_referer']['extract_parameters'] = self._normalizeParameters(hit['extract_referer']['extract_parameters'])
        return True
385
386 def _decodeTime(self, hit):
387 try:
388 hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
389 except ValueError, e:
390 if sys.version_info < (3, 2):
391 # Try without UTC value at the end (%z not recognized)
392 gmt_offset_str = hit['time_local'][-5:]
393 gmt_offset_hours = int(gmt_offset_str[1:3])*60*60
394 gmt_offset_minutes = int(gmt_offset_str[3:5])*60
395 gmt_offset = gmt_offset_hours + gmt_offset_minutes
396 hit['time_decoded'] = time.strptime(hit['time_local'][:-6], conf.time_format[:-3])
397 # if gmt_offset_str[0] == '-':
398 # hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])+gmt_offset)
399 # else:
400 # hit['time_decoded'] = time.localtime(time.mktime(hit['time_decoded'])-gmt_offset)
401 else:
402 raise e
403 return hit['time_decoded']
404
405 def getDisplayIndex(self):
406 cur_time = self.meta_infos['last_time']
407 filename = self.getCurDisplayPath('index.html')
408
409 return self.display.getPage(filename)
410
411 def _generateDisplayDaysStats(self):
412 cur_time = self.meta_infos['last_time']
413 title = createCurTitle(self, self._('Statistics'))
414 filename = self.getCurDisplayPath('index.html')
415 self.logger.info('==> Generate display (%s)' % (filename))
416 page = self.display.createPage(title, filename, conf.css_path)
417 link = DisplayHTMLRaw(self, '<iframe src="../_stats.html"></iframe>')
418 page.appendBlock(link)
419
420 _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
421 days = self.display.createBlock(DisplayHTMLBlockTableWithGraph, self._('By day'), [self._('Day'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth')], None, nb_month_days, range(1,6))
422 days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
423 nb_visits = 0
424 nb_days = 0
425 for i in range(1, nb_month_days+1):
426 day = '%d<br/>%s' % (i, time.strftime('%b', cur_time))
427 full_day = '%02d %s %d' % (i, time.strftime('%b', cur_time), cur_time.tm_year)
428 if i in self.current_analysis['days_stats'].keys():
429 stats = self.current_analysis['days_stats'][i]
430 row = [full_day, stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
431 stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
432 nb_visits += stats['nb_visits']
433 nb_days += 1
434 else:
435 row = [full_day, 0, 0, 0, 0, 0]
436 days.appendRow(row)
437 days.setCellValue(i-1, 4, bytesToStr(row[4]))
438 days.setCellValue(i-1, 5, bytesToStr(row[5]))
439 days.appendShortTitle(day)
440 adate = date(cur_time.tm_year, cur_time.tm_mon, i)
441 week_day = adate.weekday()
442 if week_day == 5 or week_day == 6:
443 days.setRowCSSClass(i-1, 'iwla_weekend')
444 if adate == date.today():
445 css = days.getCellCSSClass(i-1, 0)
446 if css: css = '%s %s' % (css, 'iwla_curday')
447 else: css = 'iwla_curday'
448 days.setCellCSSClass(i-1, 0, css)
449
450 stats = self.current_analysis['month_stats']
451
452 row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
453 if nb_days:
454 average_row = map(lambda(v): int(v/nb_days), row)
455 else:
456 average_row = map(lambda(v): 0, row)
457
458 average_row[0] = self._('Average')
459 average_row[4] = bytesToStr(average_row[4])
460 average_row[5] = bytesToStr(average_row[5])
461 days.appendRow(average_row)
462
463 row[0] = self._('Total')
464 row[4] = bytesToStr(row[4])
465 row[5] = bytesToStr(row[5])
466 days.appendRow(row)
467 page.appendBlock(days)
468 self.display.addPage(page)
469
    def _generateDisplayMonthStats(self, page, year, month_stats):
        """Append a by-month summary table for 'year' to 'page'.

        Also builds a copy of the table without the 'Details' column, saved
        as year/_stats.html (embedded as an iframe by the month pages).
        """
        cur_time = time.localtime()
        months_name = ['', self._('Jan'), self._('Feb'), self._('Mar'), self._('Apr'), self._('May'), self._('June'), self._('Jul'), self._('Aug'), self._('Sep'), self._('Oct'), self._('Nov'), self._('Dec')]
        title = '%s %d' % (self._('Summary'), year)
        cols = [self._('Month'), self._('Visitors'), self._('Visits'), self._('Pages'), self._('Hits'), self._('Bandwidth'), self._('Not viewed Bandwidth'), self._('Details')]
        graph_cols=range(1,7)
        months = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols, None, 12, graph_cols)
        months.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth', ''])
        # Same table, minus the 'Details' column, for the standalone page
        months_ = self.display.createBlock(DisplayHTMLBlockTableWithGraph, title, cols[:-1], None, 12, graph_cols[:-1])
        months_.setColsCSSClass(['', 'iwla_visitor', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        total = [0] * len(cols)
        for i in range(1, 13):
            month = '%s<br/>%d' % (months_name[i], year)
            full_month = '%s %d' % (months_name[i], year)
            if i in month_stats.keys():
                stats = month_stats[i]
                link = '<a href="%d/%02d/index.html">%s</a>' % (year, i, self._('Details'))
                row = [full_month, stats['nb_visitors'], stats['nb_visits'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
                for j in graph_cols:
                    total[j] += row[j]
            else:
                row = [full_month, 0, 0, 0, 0, 0, 0, '']
            months.appendRow(row)
            # Columns 5 and 6 hold byte counts: show human readable values
            months.setCellValue(i-1, 5, bytesToStr(row[5]))
            months.setCellValue(i-1, 6, bytesToStr(row[6]))
            months.appendShortTitle(month)
            months_.appendRow(row[:-1])
            months_.setCellValue(i-1, 5, bytesToStr(row[5]))
            months_.setCellValue(i-1, 6, bytesToStr(row[6]))
            months_.appendShortTitle(month)
            # Highlight the current month in both tables
            if year == cur_time.tm_year and i == cur_time.tm_mon:
                css = months.getCellCSSClass(i-1, 0)
                if css: css = '%s %s' % (css, 'iwla_curday')
                else: css = 'iwla_curday'
                months.setCellCSSClass(i-1, 0, css)
                months_.setCellCSSClass(i-1, 0, css)

        total[0] = self._('Total')
        total[5] = bytesToStr(total[5])
        total[6] = bytesToStr(total[6])
        total[7] = u''
        months.appendRow(total)
        page.appendBlock(months)

        months_.appendRow(total[:-1])
        filename = '%d/_stats.html' % (year)
        page_ = self.display.createPage(u'', filename, conf.css_path)
        page_.appendBlock(months_)
        page_.build(conf.DISPLAY_ROOT, False)
520
    def _generateDisplayWholeMonthStats(self):
        """Build the main index.html page: last update time, analysis
        duration and one per-year summary table."""
        title = '%s %s' % (self._('Statistics for'), conf.domain_name)
        filename = 'index.html'

        self.logger.info('==> Generate main page (%s)' % (filename))

        page = self.display.createPage(title, filename, conf.css_path)

        # NOTE(review): '%02d' is not a standard strftime directive
        # (glibc extension) — confirm behavior on other platforms.
        last_update = u'<b>%s</b> %s<br />' % (self._(u'Last update'), time.strftime('%02d %b %Y %H:%M', time.localtime()))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, last_update))
        # Analysis duration, rendered as "H hours, M minutes and S seconds"
        duration = datetime.now() - self.start_time
        duration = time.gmtime(duration.seconds)
        time_analysis = u'<b>%s</b> ' % (self._('Time analysis'))
        if duration.tm_hour:
            time_analysis += u'%d %s, ' % (duration.tm_hour, self._(u'hours'))
        time_analysis += u'%d %s and %d %s<br />' % (duration.tm_min, self._(u'minutes'), duration.tm_sec, self._(u'seconds'))
        page.appendBlock(self.display.createBlock(DisplayHTMLRaw, time_analysis))

        # Most recent years first
        for year in sorted(self.meta_infos['stats'].keys(), reverse=True):
            self._generateDisplayMonthStats(page, year, self.meta_infos['stats'][year])

        self.display.addPage(page)
543
544 def _compressFile(self, build_time, root, filename):
545 path = os.path.join(root, filename)
546 gz_path = path + '.gz'
547
548 self.logger.debug('Compress %s => %s' % (path, gz_path))
549
550 if not os.path.exists(gz_path) or\
551 os.stat(path).st_mtime >= build_time:
552 with open(path, 'rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
553 f_out.write(f_in.read())
554
555 def _compressFiles(self, build_time, root):
556 if not conf.compress_output_files: return
557 for rootdir, subdirs, files in os.walk(root, followlinks=True):
558 for f in files:
559 for ext in conf.compress_output_files:
560 if f.endswith(ext):
561 self._compressFile(build_time, rootdir, f)
562 break
563
    def _generateDisplay(self):
        """Build all HTML pages (day stats, plugin hooks, month summary)
        then write and optionally compress the output tree."""
        self._generateDisplayDaysStats()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self._generateDisplayWholeMonthStats()
        # Remember build time so _compressFiles only refreshes touched files
        build_time = time.mktime(time.localtime())
        self.display.build(conf.DISPLAY_ROOT)
        self._compressFiles(build_time, conf.DISPLAY_ROOT)
571
572 def _createEmptyStats(self):
573 stats = {}
574 stats['viewed_bandwidth'] = 0
575 stats['not_viewed_bandwidth'] = 0
576 stats['viewed_pages'] = 0
577 stats['viewed_hits'] = 0
578 stats['nb_visits'] = 0
579
580 return stats
581
582 def _generateMonthStats(self):
583 self._clearDisplay()
584
585 visits = self.current_analysis['visits']
586
587 stats = self._createEmptyStats()
588 for (day, stat) in self.current_analysis['days_stats'].items():
589 for k in stats.keys():
590 stats[k] += stat[k]
591
592 duplicated_stats = {k:v for (k,v) in stats.items()}
593
594 cur_time = self.meta_infos['last_time']
595 self.logger.info("== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon))
596 self.logger.info(stats)
597
598 if not 'month_stats' in self.current_analysis.keys():
599 self.current_analysis['month_stats'] = stats
600 else:
601 for (k,v) in stats.items():
602 self.current_analysis['month_stats'][k] = v
603
604 self.valid_visitors = {}
605 for (k,v) in visits.items():
606 if self.isValidVisitor(v):
607 self.valid_visitors[k] = v
608
609 duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
610
611 if args.display_only:
612 self._generateDisplay()
613 return
614
615 self._callPlugins(conf.POST_HOOK_DIRECTORY)
616
617 path = self.getDBFilename(cur_time)
618 if os.path.exists(path):
619 os.remove(path)
620
621 self.logger.info("==> Serialize to %s" % (path))
622 self._serialize(self.current_analysis, path)
623
624 # Save month stats
625 year = cur_time.tm_year
626 month = cur_time.tm_mon
627 if not 'stats' in self.meta_infos.keys():
628 self.meta_infos['stats'] = {}
629 if not year in self.meta_infos['stats'].keys():
630 self.meta_infos['stats'][year] = {}
631 self.meta_infos['stats'][year][month] = duplicated_stats
632
633 self.logger.info("==> Serialize to %s" % (conf.META_PATH))
634 self._serialize(self.meta_infos, conf.META_PATH)
635
636 self._generateDisplay()
637
638 def _generateDayStats(self):
639 if args.display_only:
640 return
641
642 visits = self.current_analysis['visits']
643 cur_time = self.meta_infos['last_time']
644
645 self._callPlugins(conf.PRE_HOOK_DIRECTORY)
646
647 stats = self._createEmptyStats()
648
649 for (k, super_hit) in visits.items():
650 if super_hit['last_access'].tm_mday != cur_time.tm_mday:
651 continue
652 viewed_pages = False
653 for hit in super_hit['requests'][::-1]:
654 if hit['time_decoded'].tm_mday != cur_time.tm_mday:
655 break
656 if super_hit['robot'] or\
657 not self.hasBeenViewed(hit):
658 stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
659 continue
660 stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
661 if hit['is_page']:
662 stats['viewed_pages'] += 1
663 viewed_pages = True
664 else:
665 stats['viewed_hits'] += 1
666 if (conf.count_hit_only_visitors or\
667 viewed_pages) and\
668 not super_hit['robot']:
669 stats['nb_visits'] += 1
670
671 self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
672 self.logger.info(stats)
673
674 self.current_analysis['days_stats'][cur_time.tm_mday] = stats
675
676 def _newHit(self, hit):
677 if not self.domain_name_re.match(hit['server_name']):
678 self.logger.debug("Not in domain %s" % (hit))
679 return False
680
681 t = self._decodeTime(hit)
682
683 cur_time = self.meta_infos['last_time']
684
685 if cur_time == None:
686 self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
687 self.analyse_started = True
688 else:
689 if not self.analyse_started and\
690 time.mktime(t) <= time.mktime(cur_time):
691 self.logger.debug("Not in time")
692 return False
693 self.analyse_started = True
694 if cur_time.tm_mon != t.tm_mon:
695 self._generateDayStats()
696 self._generateMonthStats()
697 self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
698 elif cur_time.tm_mday != t.tm_mday:
699 self._generateDayStats()
700
701 self.meta_infos['last_time'] = t
702
703 if not self.meta_infos['start_analysis_time']:
704 self.meta_infos['start_analysis_time'] = t
705
706 if not self._decodeHTTPRequest(hit): return False
707
708 if hit['extract_request']['http_method'] not in ['GET', 'POST']:
709 return False
710
711 for k in hit.keys():
712 if hit[k] == '-' or hit[k] == '*':
713 hit[k] = ''
714
715 self._appendHit(hit)
716
717 return True
718
719 def _reset(self):
720 reset_time = time.strptime(self.args.reset, '%m/%Y')
721
722 self.logger.info('Reset time')
723 self.logger.info(reset_time)
724
725 self.meta_infos['last_time'] = reset_time
726
727 cur_time = time.localtime()
728 year = reset_time.tm_year
729 while year < cur_time.tm_year:
730 db_path = os.path.join(conf.DB_ROOT, str(year))
731 if os.path.exists(db_path): shutil.rmtree(db_path)
732 output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
733 if os.path.exists(output_path): shutil.rmtree(output_path)
734 year += 1
735 month = reset_time.tm_mon
736 while month <= cur_time.tm_mon:
737 db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
738 if os.path.exists(db_path): shutil.rmtree(db_path)
739 output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
740 if os.path.exists(output_path): shutil.rmtree(output_path)
741 month += 1
742
    def start(self, _file, args):
        """Run the analysis: load previous state, parse every line of _file
        (an iterable of log lines), then flush day/month statistics.

        _file -- iterable yielding log lines (FileIter or sys.stdin)
        args  -- parsed command line arguments (argparse Namespace)
        """
        self.args = args
        self.start_time = datetime.now()

        self.logger.info('==> Load previous database')

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            if args.reset:
                self._reset()
            self.logger.info('Last time')
            self.logger.info(self.meta_infos['last_time'])
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        self.logger.info('==> Analysing log')

        for l in _file:
            # print "line " + l

            groups = self.log_re.match(l)

            if groups:
                self._newHit(groups.groupdict())
            else:
                self.logger.warning("No match for %s" % (l))
                #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            # NOTE(review): meta was already serialized (with
            # start_analysis_time) inside _generateMonthStats; this del only
            # affects the in-memory copy — confirm this is intended.
            del self.meta_infos['start_analysis_time']
        else:
            self.logger.info('==> Analyse not started : nothing new')
782
783
class FileIter(object):
    """Iterator over the lines of one or several (possibly gzipped) log files.

    filenames -- comma-separated list of paths; a missing file aborts the
    program. Lines are yielded without their trailing newline.
    """

    def __init__(self, filenames):
        self.filenames = [f for f in filenames.split(',') if f]
        for f in self.filenames:
            if not os.path.exists(f):
                # Bug fix: 'print' statement was Python 2-only syntax; the
                # rest of the file already uses the function form.
                print('No such file \'%s\'' % (f))
                sys.exit(-1)
        self.cur_file = None
        self._openNextFile()

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iteration protocol delegates to the Python 2 one
        return self.next()

    def _openNextFile(self):
        """Close the current file and open the next one.

        Raises StopIteration when all files have been consumed.
        """
        if self.cur_file:
            self.cur_file.close()
            self.cur_file = None
        if not self.filenames:
            raise StopIteration()
        filename = self.filenames.pop(0)
        if filename.endswith('gz'):
            self.cur_file = gzip.open(filename, 'r')
        else:
            self.cur_file = open(filename)

    def next(self):
        l = self.cur_file.readline()
        # Bug fix: loop (instead of a single retry) so empty files in the
        # middle of the list are skipped rather than yielding a spurious
        # empty line.
        while not l:
            self._openNextFile()
            l = self.cur_file.readline()
        return l[:-1]
818
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Intelligent Web Log Analyzer')

    parser.add_argument('-c', '--clean-output', dest='clean_output', action='store_true',
                        default=False,
                        help='Clean output before starting')

    parser.add_argument('-i', '--stdin', dest='stdin', action='store_true',
                        default=False,
                        help='Read data from stdin instead of conf.analyzed_filename')

    parser.add_argument('-f', '--file', dest='file',
                        help='Analyse this log file, multiple files can be specified (comma separated). gz files are accepted')

    parser.add_argument('-d', '--log-level', dest='loglevel',
                        default='INFO', type=str,
                        help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))

    parser.add_argument('-r', '--reset', dest='reset',
                        default=False,
                        help='Reset analysis to a specific date (month/year)')

    parser.add_argument('-z', '--dont-compress', dest='dont_compress', action='store_true',
                        default=False,
                        help='Don\'t compress databases (bigger but faster, not compatible with compressed databases)')

    parser.add_argument('-p', '--display-only', dest='display_only', action='store_true',
                        default=False,
                        help='Only generate display')

    args = parser.parse_args()

    # Load user conf: plain keys override the default conf; '<key>_append'
    # keys extend the corresponding default list value instead.
    for (k,v) in user_conf.__dict__.items():
        if k.endswith('_append'):
            new_k = k[:-7]
            if new_k in dir(conf):
                if type(conf.__dict__[new_k]) == list:
                    if type(v) == list:
                        conf.__dict__[new_k] += v
                    else:
                        conf.__dict__[new_k].append(v)
                else:
                    print("Error %s is not a list" % (new_k))
            else:
                print("Error %s doesn't exists in default conf" % (new_k))
        else:
            conf.__dict__.update({k:v})

    # --clean-output: start from scratch
    if args.clean_output:
        if os.path.exists(conf.DB_ROOT): shutil.rmtree(conf.DB_ROOT)
        if os.path.exists(conf.DISPLAY_ROOT): shutil.rmtree(conf.DISPLAY_ROOT)

    # Map the -d string to a logging module level
    loglevel = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(loglevel, int):
        raise ValueError('Invalid log level: %s' % (args.loglevel))

    iwla = IWLA(loglevel)

    required_conf = ['analyzed_filename', 'domain_name']
    if not validConfRequirements(required_conf, iwla, 'Main Conf'):
        sys.exit(0)

    if args.stdin:
        iwla.start(sys.stdin, args)
    else:
        filename = args.file or conf.analyzed_filename
        iwla.start(FileIter(filename), args)

Archive Download this file

Branches

Tags