iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python

import os
import re
import time
import pickle
import gzip
import importlib
from calendar import monthrange
from datetime import date

import default_conf as conf
import conf as _
conf.__dict__.update(_.__dict__)
del _

from iplugin import *
from display import *

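# IWLA drives the whole analysis: it reads the access log line by line,
# groups hits into per-visitor entries, maintains day and month statistics,
# and delegates extra processing to pre/post analysis and display plugins.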
class IWLA(object):

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1

    def __init__(self):
        print '==> Start'

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild(self)
        self.valid_visitors = None

        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', conf.log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]+)(\?(?P<extract_parameters>.+))?')
        self.plugins = [(conf.PRE_HOOK_DIRECTORY , conf.pre_analysis_hooks),
                        (conf.POST_HOOK_DIRECTORY , conf.post_analysis_hooks),
                        (conf.DISPLAY_HOOK_DIRECTORY , conf.display_hooks)]

    def getConfValue(self, key, default=None):
        if not key in dir(conf):
            return default
        else:
            return conf.__dict__[key]

    def _clearVisits(self):
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
        }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        return self.current_analysis['visits']

    def getValidVisitors(self):
        return self.valid_visitors

    def getDisplay(self):
        return self.display

    def getCurTime(self):
        return self.meta_infos['last_time']

    def getStartAnalysisTime(self):
        return self.meta_infos['start_analysis_time']

    def isValidForCurrentAnalysis(self, request):
        cur_time = self.meta_infos['start_analysis_time']
        # Analyse not started
        if not cur_time: return False
        return (time.mktime(cur_time) < time.mktime(request['time_decoded']))

    def hasBeenViewed(self, request):
        return int(request['status']) in conf.viewed_http_codes

    def getCurDisplayPath(self, filename):
        cur_time = self.meta_infos['last_time']
        return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename)

    def getResourcesPath(self):
        return conf.resources_path

    def getCSSPath(self):
        return conf.css_path

    def _clearMeta(self):
        self.meta_infos = {
            'last_time' : None
        }
        return self.meta_infos

    def _clearDisplay(self):
        self.display = DisplayHTMLBuild(self)
        return self.display

    def getDBFilename(self, time):
        return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME)

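    # Persistence helpers: the analysis state is pickled, then compressed
    # with gzip into the per-month database file.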
    def _serialize(self, obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        with open(filename + '.tmp', 'wb+') as f:
            pickle.dump(obj, f)
            f.seek(0)
            with gzip.open(filename, 'w') as fzip:
                fzip.write(f.read())
        os.remove(filename + '.tmp')

    def _deserialize(self, filename):
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'r') as f:
            return pickle.load(f)
        return None

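    # Run the hook() function of every loaded plugin registered under
    # target_root (pre analysis, post analysis or display hooks).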
    def _callPlugins(self, target_root, *args):
        print '==> Call plugins (%s)' % target_root
        for (root, plugins) in self.plugins:
            if root != target_root: continue
            for p in plugins:
                mod = self.cache_plugins.get(root + '.' + p, None)
                if mod:
                    print '\t%s' % (p)
                    mod.hook(*args)

    def isPage(self, request):
        for e in conf.pages_extensions:
            if request.endswith(e):
                return True

        return False

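    # Attach a parsed hit to its visitor entry (creating it on first sight)
    # and update bandwidth, page and hit counters.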
    def _appendHit(self, hit):
        remote_addr = hit['remote_addr']

        if not remote_addr: return

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createVisitor(hit)
            return

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        uri = request.get('extract_uri', request['http_uri'])

        hit['is_page'] = self.isPage(uri)

        status = int(hit['status'])
        if status not in conf.viewed_http_codes:
            return

        if super_hit['robot'] or\
           not status in conf.viewed_http_codes:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createVisitor(self, hit):
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['remote_ip'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0
        self._appendHit(hit)

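    # Split the raw request field into method, URI and protocol version,
    # separate the URI from its query string, and decompose the referer
    # when present.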
    def _decodeHTTPRequest(self, hit):
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d.keys():
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            print "Bad request extraction " + hit['request']
            return False

        if hit['http_referer']:
            referer_groups = self.uri_re.match(hit['http_referer'])
            if referer_groups:
                hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        hit['time_decoded'] = time.strptime(hit['time_local'], conf.time_format)
        return hit['time_decoded']

    def getDisplayIndex(self):
        cur_time = self.meta_infos['last_time']
        filename = self.getCurDisplayPath('index.html')

        return self.display.getPage(filename)

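    # Build the "By day" HTML table: one row per day of the month, plus
    # Average and Total rows, with week-end rows highlighted.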
    def _generateDisplayDaysStat(self):
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
        filename = self.getCurDisplayPath('index.html')
        print '==> Generate display (%s)' % (filename)
        page = DisplayHTMLPage(title, filename, conf.css_path)

        _, nb_month_days = monthrange(cur_time.tm_year, cur_time.tm_mon)
        days = DisplayHTMLBlockTableWithGraph('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'], nb_valid_rows=nb_month_days)
        days.setColsCSSClass(['', 'iwla_visit', 'iwla_page', 'iwla_hit', 'iwla_bandwidth', 'iwla_bandwidth'])
        nb_visits = 0
        nb_days = 0
        for i in range(0, nb_month_days):
            cur_day = '%d %s' % (i+1, time.strftime('%b', cur_time))
            full_cur_day = '%s %d' % (cur_day, cur_time.tm_year)
            if i in self.current_analysis['days_stats'].keys():
                stats = self.current_analysis['days_stats'][i]
                row = [full_cur_day, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
                       stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
                nb_visits += stats['nb_visitors']
                nb_days += 1
            else:
                row = [full_cur_day, 0, 0, 0, 0, 0]
            days.appendRow(row)
            days.setCellValue(i, 4, bytesToStr(row[4]))
            days.setCellValue(i, 5, bytesToStr(row[5]))
            days.appendShortTitle(cur_day)
            week_day = date(cur_time.tm_year, cur_time.tm_mon, i+1).weekday()
            if week_day == 5 or week_day == 6:
                days.setRowCSSClass(i, 'iwla_weekend')

        stats = self.current_analysis['month_stats']

        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            average_row = map(lambda(v): int(v/nb_days), row)
        else:
            average_row = map(lambda(v): 0, row)

        average_row[0] = 'Average'
        average_row[4] = bytesToStr(average_row[4])
        average_row[5] = bytesToStr(average_row[5])
        days.appendRow(average_row)

        row[0] = 'Total'
        row[4] = bytesToStr(row[4])
        row[5] = bytesToStr(row[5])
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplay(self):
        self._generateDisplayDaysStat()
        self._callPlugins(conf.DISPLAY_HOOK_DIRECTORY)
        self.display.build(conf.DISPLAY_ROOT)

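    # Aggregate per-visitor counters into a stats dictionary; robots only
    # contribute to the not viewed bandwidth.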
    def _generateStats(self, visits):
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        #stats['requests'] = set()
        stats['nb_visitors'] = 0

        for (k, super_hit) in visits.items():
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth']
                continue

            #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])

            if conf.count_hit_only_visitors or\
               super_hit['viewed_pages']:
                stats['nb_visitors'] += 1
            stats['viewed_bandwidth'] += super_hit['bandwidth']
            stats['viewed_pages'] += super_hit['viewed_pages']
            stats['viewed_hits'] += super_hit['viewed_hits']

            # for p in super_hit['requests']:
            #     if not p['is_page']: continue
            #     req = p['extract_request']
            #     stats['requests'].add(req['extract_uri'])

        return stats

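    # Close the current month: compute month statistics, filter valid
    # visitors, run post analysis hooks, serialize the database and
    # regenerate the HTML output.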
    def _generateMonthStats(self):
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._generateStats(visits)
        duplicated_stats = {k:v for (k,v) in stats.items()}

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
        print stats

        if not 'month_stats' in self.current_analysis.keys():
            self.current_analysis['month_stats'] = stats
        else:
            for (k,v) in stats.items():
                self.current_analysis['month_stats'][k] = v

        self.valid_visitors = {}
        for (k,v) in visits.items():
            if v['robot']: continue
            if conf.count_hit_only_visitors and\
               (not v['viewed_pages']):
                continue
            self.valid_visitors[k] = v

        self._callPlugins(conf.POST_HOOK_DIRECTORY)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print "==> Serialize to %s" % path

        self._serialize(self.current_analysis, path)

        self._generateDisplay()

        # Save month stats
        year = '%d' % (cur_time.tm_year)
        month = '%d' % (cur_time.tm_mon)
        if not 'stats' in self.meta_infos.keys():
            self.meta_infos['stats'] = {}
        if not year in self.meta_infos['stats'].keys():
            self.meta_infos['stats'][year] = {}
        self.meta_infos['stats'][year][month] = duplicated_stats

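    # Derive the current day's statistics from the month-to-date counters
    # and count the visitors whose last access falls on that day.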
    def _generateDayStats(self):
        visits = self.current_analysis['visits']

        self._callPlugins(conf.PRE_HOOK_DIRECTORY)

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

        if cur_time.tm_mday > 1:
            last_day = cur_time.tm_mday - 1
            while last_day:
                if last_day in self.current_analysis['days_stats'].keys():
                    break
                last_day -= 1
            if last_day:
                for k in stats.keys():
                    stats[k] -= self.current_analysis['days_stats'][last_day][k]
        stats['nb_visitors'] = 0
        for (k,v) in visits.items():
            if v['robot']: continue
            if conf.count_hit_only_visitors and\
               (not v['viewed_pages']):
                continue
            if v['last_access'].tm_mday == cur_time.tm_mday:
                stats['nb_visitors'] += 1
        print stats

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

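    # Process one parsed log line: detect month/day changes, normalize
    # empty fields ('-' or '*') and append the hit to the current analysis.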
    def _newHit(self, hit):
        t = self._decodeTime(hit)

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started:
                if time.mktime(t) < time.mktime(cur_time):
                    return False
                else:
                    self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon:
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self.meta_infos['start_analysis_time']:
            self.meta_infos['start_analysis_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        for k in hit.keys():
            if hit[k] == '-' or hit[k] == '*':
                hit[k] = ''

        self._appendHit(hit)

        return True

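    # Entry point: load the previous database, preload plugins, parse the
    # log file and generate the final statistics and reports.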
    def start(self):
        print '==> Load previous database'

        self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        self.meta_infos['start_analysis_time'] = None

        self.cache_plugins = preloadPlugins(self.plugins, self)

        print '==> Analysing log'

        with open(conf.analyzed_filename) as f:
            for l in f:
                # print "line " + l

                groups = self.log_re.match(l)

                if groups:
                    if not self._newHit(groups.groupdict()):
                        break
                else:
                    print "No match for " + l
                    #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            del self.meta_infos['start_analysis_time']
            self._serialize(self.meta_infos, conf.META_PATH)
        else:
            print '==> Analyse not started : nothing to do'
            self._generateMonthStats()

if __name__ == '__main__':
    iwla = IWLA()
    iwla.start()
