iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python

import sys
import os
import re
import time
import glob
import imp
import pickle
import gzip
import importlib

from display import *

from default_conf import *
from conf import *

class IWLA(object):

    ANALYSIS_CLASS = 'HTTP'
    API_VERSION = 1

    def __init__(self):
        print '==> Start'

        self.meta_infos = {}
        self.analyse_started = False
        self.current_analysis = {}
        self.cache_plugins = {}
        self.display = DisplayHTMLBuild()
        self.valid_visitors = None

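        # Build a regular expression from the configured log_format:
        # every character that is not '$' or a word character is escaped,
        # then each '$name' placeholder becomes a named group '(?P<name>.+)'.
        # For illustration only (the real value comes from conf.py), a format
        # like '$remote_addr [$time_local] "$request"' would turn into
        # r'(?P<remote_addr>.+)\ \[(?P<time_local>.+)\]\ \"(?P<request>.+)\"'.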
        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
        self.log_re = re.compile(self.log_format_extracted)
        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]*)[\?(?P<extract_parameters>.*)]?')
        self.plugins = {PRE_HOOK_DIRECTORY : pre_analysis_hooks,
                        POST_HOOK_DIRECTORY : post_analysis_hooks,
                        DISPLAY_HOOK_DIRECTORY : display_hooks}

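    # Import every plugin listed in the pre/post/display hook lists and keep
    # it only if its get_plugins_infos() reports a compatible analysis class
    # and API version range and its load() hook succeeds.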
    def _preloadPlugins(self):
        ret = True
        for root in self.plugins.keys():
            for plugin_name in self.plugins[root]:
                #p = root + '/' + plugin_name
                p = root + '.' + plugin_name
                try:
                    # fp, pathname, description = imp.find_module(plugin_name, [root])
                    # self.cache_plugins[p] = imp.load_module(p, fp, pathname, description)
                    #p = 'plugins.display.top_visitors'
                    #sys.path.append(root)
                    #self.cache_plugins[p] = importlib.import_module(plugin_name, root)
                    #sys.path.remove(root)
                    self.cache_plugins[p] = importlib.import_module(p)
                    mod = self.cache_plugins[p]
                    infos = mod.get_plugins_infos()
                    if infos['class'] != IWLA.ANALYSIS_CLASS or \
                       IWLA.API_VERSION < infos['min_version'] or\
                       (infos['max_version'] != -1 and (IWLA.API_VERSION > infos['max_version'])):
                        del self.cache_plugins[p]
                    elif not mod.load():
                        del self.cache_plugins[p]
                except Exception as e:
                    print 'Error loading \'%s\' => %s' % (p, e)
                    ret = False
        return ret

    def _clearVisits(self):
        self.current_analysis = {
            'days_stats' : {},
            'month_stats' : {},
            'visits' : {}
            }
        self.valid_visitors = None
        return self.current_analysis

    def getDaysStats(self):
        return self.current_analysis['days_stats']

    def getMonthStats(self):
        return self.current_analysis['month_stats']

    def getCurrentVisists(self):
        return self.current_analysis['visits']

    def getValidVisitors(self):
        return self.valid_visitors

    def getDisplay(self):
        return self.display

    def _clearMeta(self):
        self.meta_infos = {
            'last_time' : None
            }
        return self.meta_infos

    def _clearDisplay(self):
        self.display = DisplayHTMLBuild()
        return self.display

    def getDBFilename(self, time):
        return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME)

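    # Persistence helpers: analysis state is pickled and gzip-compressed on
    # disk. Note that _serialize() currently returns early (see the TODO
    # below), so nothing is actually written yet.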
    def _serialize(self, obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)

        # TODO : remove return
        return

        with open(filename + '.tmp', 'wb+') as f:
            pickle.dump(obj, f)
            f.seek(0)
            with gzip.open(filename, 'w') as fzip:
                fzip.write(f.read())
        os.remove(filename + '.tmp')

    def _deserialize(self, filename):
        if not os.path.exists(filename):
            return None

        with gzip.open(filename, 'r') as f:
            return pickle.load(f)
        return None

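    # Call the hook() entry point of every plugin registered under the given
    # hook directory, forwarding the extra arguments.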
    def _callPlugins(self, root, *args):
        print '==> Call plugins (%s)' % root
        for p in self.plugins[root]:
            print '\t%s' % (p)
            mod = self.cache_plugins[root + '.' + p]
            mod.hook(*args)

    def isPage(self, request):
        for e in pages_extensions:
            if request.endswith(e):
                return True

        return False

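    # Accumulate one log line into the per-visitor structure keyed by remote
    # address: bandwidth, last access time and viewed / not-viewed page and
    # hit counters (3xx responses are not counted).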
    def _appendHit(self, hit):
        remote_addr = hit['remote_addr']

        if not remote_addr in self.current_analysis['visits'].keys():
            self._createUser(hit)
            return

        super_hit = self.current_analysis['visits'][remote_addr]
        super_hit['requests'].append(hit)
        super_hit['bandwidth'] += int(hit['body_bytes_sent'])
        super_hit['last_access'] = self.meta_infos['last_time']

        request = hit['extract_request']

        if 'extract_uri' in request.keys():
            uri = request['extract_uri']
        else:
            uri = request['http_uri']

        hit['is_page'] = self.isPage(uri)

        # Don't count 3xx status
        status = int(hit['status'])
        if status >= 300 and status < 400: return

        if super_hit['robot'] or\
           not status in viewed_http_codes:
            page_key = 'not_viewed_pages'
            hit_key = 'not_viewed_hits'
        else:
            page_key = 'viewed_pages'
            hit_key = 'viewed_hits'

        if hit['is_page']:
            super_hit[page_key] += 1
        else:
            super_hit[hit_key] += 1

    def _createUser(self, hit):
        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
        super_hit['remote_addr'] = hit['remote_addr']
        super_hit['viewed_pages'] = 0
        super_hit['viewed_hits'] = 0
        super_hit['not_viewed_pages'] = 0
        super_hit['not_viewed_hits'] = 0
        super_hit['bandwidth'] = 0
        super_hit['last_access'] = self.meta_infos['last_time']
        super_hit['requests'] = []
        super_hit['robot'] = False
        super_hit['hit_only'] = 0
        self._appendHit(hit)

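    # Split the raw request field into method, URI and HTTP version, then
    # separate the URI from its query string and decode the referer the
    # same way.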
    def _decodeHTTPRequest(self, hit):
        if not 'request' in hit.keys(): return False

        groups = self.http_request_extracted.match(hit['request'])

        if groups:
            hit['extract_request'] = groups.groupdict()
            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
            if uri_groups:
                d = uri_groups.groupdict()
                hit['extract_request']['extract_uri'] = d['extract_uri']
                if 'extract_parameters' in d.keys():
                    hit['extract_request']['extract_parameters'] = d['extract_parameters']
        else:
            print "Bad request extraction " + hit['request']
            return False

        referer_groups = self.uri_re.match(hit['http_referer'])
        if referer_groups:
            referer = hit['extract_referer'] = referer_groups.groupdict()
        return True

    def _decodeTime(self, hit):
        hit['time_decoded'] = time.strptime(hit['time_local'], time_format)

    def getDisplayIndex(self):
        cur_time = self.meta_infos['last_time']
        filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)

        return self.display.getPage(filename)

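    # Build the monthly index page containing the 'By day' table, with one
    # row per analysed day plus 'Average' and 'Total' rows.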
    def _generateDisplayDaysStat(self):
        cur_time = self.meta_infos['last_time']
        title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
        filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
        print '==> Generate display (%s)' % (filename)
        page = DisplayHTMLPage(title, filename)

        days = DisplayHTMLBlockTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Not viewed Bandwidth'])

        keys = self.current_analysis['days_stats'].keys()
        keys.sort()
        nb_visits = 0
        for k in keys:
            stats = self.current_analysis['days_stats'][k]
            row = [k, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
            row = map(lambda(v): str(v), row)
            days.appendRow(row)
            nb_visits += stats['nb_visitors']

        stats = self.current_analysis['month_stats']

        nb_days = len(keys)
        row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        if nb_days:
            row = map(lambda(v): str(int(v/nb_days)), row)
        else:
            row = map(lambda(v): '0', row)

        row[0] = 'Average'
        days.appendRow(row)

        row = ['Total', nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
        row = map(lambda(v): str(v), row)
        days.appendRow(row)
        page.appendBlock(days)
        self.display.addPage(page)

    def _generateDisplay(self):
        self._generateDisplayDaysStat()
        self._callPlugins(DISPLAY_HOOK_DIRECTORY, self)
        self.display.build(DISPLAY_ROOT)

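    # Aggregate per-visitor counters into a stats dictionary. Robots only
    # contribute to not_viewed_bandwidth; visitors flagged 'hit_only' are not
    # counted in nb_visitors but their traffic is still summed.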
    def _generateStats(self, visits):
        stats = {}
        stats['viewed_bandwidth'] = 0
        stats['not_viewed_bandwidth'] = 0
        stats['viewed_pages'] = 0
        stats['viewed_hits'] = 0
        #stats['requests'] = set()
        stats['nb_visitors'] = 0

        for k in visits.keys():
            super_hit = visits[k]
            if super_hit['robot']:
                stats['not_viewed_bandwidth'] += super_hit['bandwidth']
                continue

            #print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])

            if not super_hit['hit_only']:
                stats['nb_visitors'] += 1
            stats['viewed_bandwidth'] += super_hit['bandwidth']
            stats['viewed_pages'] += super_hit['viewed_pages']
            stats['viewed_hits'] += super_hit['viewed_hits']

            # for p in super_hit['requests']:
            #     if not p['is_page']: continue
            #     req = p['extract_request']
            #     stats['requests'].add(req['extract_uri'])

        return stats

    def _generateMonthStats(self):
        self._clearDisplay()

        visits = self.current_analysis['visits']

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
        print stats

        self.current_analysis['month_stats'] = stats

        self.valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
        self._callPlugins(POST_HOOK_DIRECTORY, self)

        path = self.getDBFilename(cur_time)
        if os.path.exists(path):
            os.remove(path)

        print "==> Serialize to %s" % path

        self._serialize(self.current_analysis, path)

        self._generateDisplay()

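    # Daily stats are obtained by subtracting the counters of the last
    # previously analysed day from the current month counters, then
    # nb_visitors is recounted from the visits seen today.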
    def _generateDayStats(self):
        visits = self.current_analysis['visits']

        self._callPlugins(PRE_HOOK_DIRECTORY, self)

        stats = self._generateStats(visits)

        cur_time = self.meta_infos['last_time']
        print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

        if cur_time.tm_mday > 1:
            last_day = cur_time.tm_mday - 1
            while last_day:
                if last_day in self.current_analysis['days_stats'].keys():
                    break
                last_day -= 1
            if last_day:
                for k in stats.keys():
                    stats[k] -= self.current_analysis['days_stats'][last_day][k]
        stats['nb_visitors'] = 0
        for k in visits.keys():
            if visits[k]['robot']: continue
            if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
                stats['nb_visitors'] += 1
        print stats

        self.current_analysis['days_stats'][cur_time.tm_mday] = stats

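    # Process one parsed log line: decode its timestamp, trigger the month or
    # day rollover when the date changes, then append the hit to the current
    # analysis.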
    def _newHit(self, hit):
        self._decodeTime(hit)

        t = hit['time_decoded']

        cur_time = self.meta_infos['last_time']

        if cur_time == None:
            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            self.analyse_started = True
        else:
            if not self.analyse_started:
                if time.mktime(cur_time) >= time.mktime(t):
                    return
                else:
                    self.analyse_started = True
            if cur_time.tm_mon != t.tm_mon:
                self._generateMonthStats()
                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
            elif cur_time.tm_mday != t.tm_mday:
                self._generateDayStats()

        self.meta_infos['last_time'] = t

        if not self._decodeHTTPRequest(hit): return False

        for k in hit.keys():
            if hit[k] == '-': hit[k] = ''

        self._appendHit(hit)

        return True

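    # Main entry point: load plugins and previous state, parse the log file
    # line by line with the generated regexp, then generate day and month
    # statistics and serialize the new state.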
    def start(self):
        self._preloadPlugins()

        print '==> Analysing log'

        self.meta_infos = self._deserialize(META_PATH) or self._clearMeta()
        if self.meta_infos['last_time']:
            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
        else:
            self._clearVisits()

        with open(analyzed_filename) as f:
            for l in f:
                # print "line " + l

                groups = self.log_re.match(l)

                if groups:
                    if not self._newHit(groups.groupdict()):
                        break
                else:
                    print "No match for " + l
                    #break

        if self.analyse_started:
            self._generateDayStats()
            self._generateMonthStats()
            self._serialize(self.meta_infos, META_PATH)
        else:
            print '==> Analyse not started : nothing to do'
            self._generateMonthStats()

if __name__ == '__main__':
    iwla = IWLA()
    iwla.start()
