iwla

iwla Git Source Tree

Root/iwla.py

1#!/usr/bin/env python
2
3import os
4import re
5import time
6import glob
7import imp
8import pickle
9import gzip
10
# Project-local table of robot user-agent patterns (regex strings).
from robots import awstats_robots;

print '==> Start'

# Global state shared by the functions below.
meta_visit = {'last_time':None}  # persisted meta DB; last_time = time of last processed hit
analyse_started = False          # True once a hit newer than meta's last_time has been seen
current_visits = {}              # per-month visit DB currently being built
cache_plugins = {}               # plugin path -> loaded module (avoid reloading)

# nginx-style access log format; each $var is turned into a named regex group below.
log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
    '"$request" $status $body_bytes_sent ' +\
    '"$http_referer" "$http_user_agent"';

# Escape every character that is neither '$' nor a word character, then
# replace each $var with a named capture group (?P<var>.+).
log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format);
log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', log_format_extracted)
# Splits "GET /uri HTTP/1.1" into method / uri / version.
http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
#09/Nov/2014:06:35:16 +0100
# NOTE(review): the timezone offset is hard-coded to +0100 -- confirm the logs never vary.
time_format = '%d/%b/%Y:%H:%M:%S +0100'
#print "Log format : " + log_format_extracted

log_re = re.compile(log_format_extracted)
# Split a URI into its path part and optional query string.
# BUG FIX: the original pattern wrote the optional query part inside a
# character class -- r'...[\?(?P<extract_parameters>.*)]?' -- where '(',
# '?', 'P', etc. are literal characters, so the 'extract_parameters'
# named group was never created and downstream code never saw query
# strings.  Use a real optional group; the group is None when the URI
# has no '?'.
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)(?:\?(?P<extract_parameters>.*))?')
# Request suffixes considered "pages" (anything else counts as a plain hit).
pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
# HTTP status codes counted as viewed.
viewed_http_codes = [200]
# Glob patterns for analysis plugins and locations of the on-disk DBs.
PRE_HOOK_DIRECTORY = './hooks/pre_analysis/*.py'
POST_HOOK_DIRECTORY = './hooks/post_analysis/*.py'
DB_ROOT = './output/'
META_PATH = DB_ROOT + 'meta.db'
DB_FILENAME = 'iwla.db'

print '==> Generating robot dictionary'

# Pre-compile every robot user-agent pattern (case-insensitive) once.
awstats_robots = map(lambda (x) : re.compile(x, re.IGNORECASE), awstats_robots)
45
def createEmptyVisits():
    """Return a fresh, empty per-month visits database structure."""
    return {
        'days_stats': {},   # day-of-month -> stats dict
        'month_stats': {},  # filled by generateMonthStats()
        'visits': {},       # remote address -> per-visitor record
    }
49
def createEmptyMeta():
    """Return a fresh meta structure; 'last_time' records the time of
    the most recently processed hit (None = nothing processed yet)."""
    return {'last_time': None}
53
def getDBFilename(time):
    """Return the month-DB path for the given struct_time:
    <DB_ROOT><year>/<month>_<DB_FILENAME>."""
    # NOTE(review): the parameter name shadows the 'time' module inside
    # this function (kept for interface compatibility).
    subpath = '%d/%d_%s' % (time.tm_year, time.tm_mon, DB_FILENAME)
    return DB_ROOT + subpath
56
def serialize(obj, filename):
    """Pickle obj and write it gzip-compressed to filename.

    Parent directories are created as needed.

    Fixes: the original contained a leftover 'return' (marked
    'TODO : remove') that disabled persistence entirely, and staged the
    pickle in a '<filename>.tmp' file that leaked on any exception.
    Writing straight into the gzip stream produces the same on-disk
    format (gzip-compressed pickle) without the temp file.
    """
    base = os.path.dirname(filename)
    # Guard against '' when filename has no directory component.
    if base and not os.path.exists(base):
        os.makedirs(base)

    # 'wb' is what the original's 'w' meant: gzip I/O is always binary.
    with gzip.open(filename, 'wb') as fzip:
        pickle.dump(obj, fzip)
71
def deserialize(filename):
    """Load and return the pickled object stored gzip-compressed in
    filename, or None when the file does not exist.

    Fix: the trailing 'return None' after the 'with' block in the
    original was unreachable and has been dropped.
    """
    if not os.path.exists(filename):
        return None

    # 'rb' is what gzip's 'r' already meant; spelled out for clarity.
    with gzip.open(filename, 'rb') as f:
        return pickle.load(f)
79
def callPlugins(path, *kwargs):
    """Load (once, cached) and run every plugin matching glob pattern
    'path', in sorted order.

    Each plugin module must expose a hook() function; the extra
    positional arguments are forwarded to it.
    NOTE(review): '*kwargs' actually collects positional arguments --
    '*args' would be the conventional name.
    """
    print '==> Call plugins (%s)' % path
    plugins = glob.glob(path)
    plugins.sort()
    for p in plugins:
        print '\t%s' % (p)
        if not p in cache_plugins:
            # Load each plugin file only once; imp.load_source would
            # re-execute the module on every call otherwise.
            mod = imp.load_source('hook', p)
            cache_plugins[p] = mod
        else:
            mod = cache_plugins[p]
        mod.hook(*kwargs)
92
def isPage(request):
    """Return True when the request URI looks like a page (as opposed
    to a resource hit such as an image or stylesheet).

    A URI is a page when it ends with '/' or one of the suffixes in
    pages_extensions.  The check is a plain suffix test (no '.'
    required), exactly like the original loop.
    """
    # str.endswith accepts a tuple of suffixes: one call instead of a loop.
    return request.endswith(tuple(pages_extensions))
99
def appendHit(hit):
    """Account one parsed log hit to its visitor in current_visits.

    The first hit from an address delegates to createUser(), which
    creates the record and calls back into appendHit().  3xx responses
    update bandwidth and the page list but are not counted in the
    viewed/not-viewed counters.
    """
    remote_addr = hit['remote_addr']

    # Idiom fixes vs original: 'x not in d' instead of 'not x in d.keys()'.
    if remote_addr not in current_visits['visits']:
        createUser(hit)
        return

    super_hit = current_visits['visits'][remote_addr]
    super_hit['pages'].append(hit)
    super_hit['bandwith'] += int(hit['body_bytes_sent'])
    super_hit['last_access'] = meta_visit['last_time']

    request = hit['extract_request']

    # Prefer the query-stripped URI when extraction produced one.
    if 'extract_uri' in request:
        uri = request['extract_uri']
    else:
        uri = request['http_uri']

    hit['is_page'] = isPage(uri)

    # Don't count 3xx status.  (The original converted hit['status']
    # to int twice; convert once and reuse.)
    status = int(hit['status'])
    if 300 <= status < 400:
        return

    # Robots and non-viewed status codes go to the 'not_viewed' counters.
    if super_hit['robot'] or status not in viewed_http_codes:
        page_key = 'not_viewed_pages'
        hit_key = 'not_viewed_hits'
    else:
        page_key = 'viewed_pages'
        hit_key = 'viewed_hits'

    if hit['is_page']:
        super_hit[page_key] += 1
    else:
        super_hit[hit_key] += 1
137
def createUser(hit):
    """Create the per-visitor record for hit's remote address, then
    re-dispatch the hit through appendHit() so it is counted like any
    other hit.

    Fix: built as one dict literal instead of nine statements with
    stray semicolons; behaviour and key names are unchanged.
    """
    super_hit = {
        'viewed_pages': 0,
        'viewed_hits': 0,
        'not_viewed_pages': 0,
        'not_viewed_hits': 0,
        'bandwith': 0,  # (sic) this spelling is used file-wide; kept for compatibility
        'last_access': meta_visit['last_time'],
        'pages': [],
        'robot': isRobot(hit),
    }
    current_visits['visits'][hit['remote_addr']] = super_hit
    appendHit(hit)
149
def isRobot(hit):
    """Return True when the hit's user agent matches any known robot
    pattern (awstats_robots holds pre-compiled case-insensitive regexes)."""
    agent = hit['http_user_agent']
    return any(r.match(agent) for r in awstats_robots)
155
def decodeHTTPRequest(hit):
    """Decode hit['request'] ("METHOD /uri HTTP/x.y") in place.

    On success, adds hit['extract_request'] (method/uri/version, plus
    extract_uri and, when present, extract_parameters) and
    hit['extract_referer'], and returns True.  Returns False when the
    'request' field is missing or unparsable.
    """
    if not 'request' in hit.keys(): return False

    groups = http_request_extracted.match(hit['request'])

    if groups:
        hit['extract_request'] = groups.groupdict()
        # Split the URI into path and (optional) query string.
        uri_groups = uri_re.match(hit['extract_request']['http_uri']);
        if uri_groups:
            d = uri_groups.groupdict()
            hit['extract_request']['extract_uri'] = d['extract_uri']
            if 'extract_parameters' in d.keys():
                hit['extract_request']['extract_parameters'] = d['extract_parameters']
    else:
        print "Bad request extraction " + hit['request']
        return False

    # The referer goes through the same URI splitter.
    referer_groups = uri_re.match(hit['http_referer']);
    if referer_groups:
        referer = hit['extract_referer'] = referer_groups.groupdict()
    return True
177
def decodeTime(hit):
    """Parse hit['time_local'] with the module-level time_format and
    store the resulting struct_time under hit['time_decoded']."""
    hit['time_decoded'] = time.strptime(hit['time_local'], time_format)
182
183
def generateStats(visits):
    """Aggregate the per-visitor records of 'visits' into one stats dict.

    Robots contribute only to not_viewed_bandwidth; human visitors add
    to the viewed counters and nb_visitors.  Counters are cumulative
    over everything in 'visits' (i.e. the month so far).
    """
    stats = {}
    stats['viewed_bandwidth'] = 0
    stats['not_viewed_bandwidth'] = 0
    stats['viewed_pages'] = 0
    stats['viewed_hits'] = 0
    #stats['pages'] = set()
    stats['nb_visitors'] = 0

    for k in visits.keys():
        super_hit = visits[k]
        if super_hit['robot']:
            # Robot traffic only counts toward non-viewed bandwidth.
            stats['not_viewed_bandwidth'] += super_hit['bandwith']
            continue

        print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])

        stats['nb_visitors'] += 1
        stats['viewed_bandwidth'] += super_hit['bandwith']
        stats['viewed_pages'] += super_hit['viewed_pages']
        stats['viewed_hits'] += super_hit['viewed_hits']

        # for p in super_hit['pages']:
        #     if not p['is_page']: continue
        #     req = p['extract_request']
        #     stats['pages'].add(req['extract_uri'])

    return stats
212
def generateMonthStats():
    """Compute stats for the month being processed, run post-analysis
    plugins on the human visitors, and persist the month DB to disk."""
    visits = current_visits['visits']

    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
    print stats

    # Post-analysis hooks only see non-robot visitors.
    valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
    callPlugins(POST_HOOK_DIRECTORY, valid_visitors)

    current_visits['month_stats'] = stats

    # Replace any existing DB file for this month.
    path = getDBFilename(cur_time)
    if os.path.exists(path):
        os.remove(path)

    print "==> Serialize to %s" % path

    serialize(current_visits, path)
234
def generateDayStats():
    """Compute stats for the current day, run pre-analysis plugins,
    and store the result under current_visits['days_stats']."""
    visits = current_visits['visits']

    callPlugins(PRE_HOOK_DIRECTORY, visits)

    stats = generateStats(visits)

    cur_time = meta_visit['last_time']
    print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)

    # generateStats() is cumulative over the month: subtract the most
    # recent earlier day that has stored stats to approximate today's delta.
    # NOTE(review): stored day stats are themselves deltas, not cumulative
    # totals, so this subtraction looks wrong from the third day on --
    # verify the intent before relying on per-day numbers.
    if cur_time.tm_mday > 1:
        last_day = cur_time.tm_mday - 1
        while last_day:
            if last_day in current_visits['days_stats'].keys():
                break
            last_day -= 1
        if last_day:
            for k in stats.keys():
                stats[k] -= current_visits['days_stats'][last_day][k]
    # nb_visitors is recomputed exactly: non-robot visitors whose last
    # access falls on the current day.
    stats['nb_visitors'] = 0
    for k in visits.keys():
        if visits[k]['robot']: continue
        if visits[k]['last_access'].tm_mday == cur_time.tm_mday:
            stats['nb_visitors'] += 1
    print stats

    current_visits['days_stats'][cur_time.tm_mday] = stats
262
def _loadOrCreateVisits(t):
    """Load the month DB matching struct_time t, or start an empty one."""
    visits = deserialize(getDBFilename(t))
    return visits if visits else createEmptyVisits()

def newHit(hit):
    """Process one parsed log line (dict of named log-format fields).

    Handles resuming (skip hits not newer than the persisted
    last_time), month/day rollover (flushing stats and switching DBs),
    then decodes and accounts the hit.  Returns False when the request
    field cannot be parsed (the caller stops reading the log on a
    falsy result), True otherwise.
    """
    global current_visits
    global analyse_started

    decodeTime(hit)

    t = hit['time_decoded']

    cur_time = meta_visit['last_time']

    if cur_time is None:
        # Very first hit for this meta DB.
        current_visits = _loadOrCreateVisits(t)
        analyse_started = True
    else:
        if not analyse_started:
            if time.mktime(cur_time) >= time.mktime(t):
                # Already processed in a previous run: skip this line but
                # keep scanning.  BUG FIX: the original bare 'return'
                # yielded None, which the caller treats as fatal and
                # aborted the whole log at the first already-seen line.
                return True
            analyse_started = True
            current_visits = _loadOrCreateVisits(t)
        if cur_time.tm_mon != t.tm_mon:
            # Month rollover: flush the finished month, load the new one.
            generateMonthStats()
            current_visits = _loadOrCreateVisits(t)
        elif cur_time.tm_mday != t.tm_mday:
            # Day rollover within the same month.
            generateDayStats()

    meta_visit['last_time'] = t

    if not decodeHTTPRequest(hit): return False

    # Normalize log placeholders: '-' means "no value".
    for k in hit.keys():
        if hit[k] == '-': hit[k] = ''

    appendHit(hit)

    return True
302
print '==> Analysing log'

# Resume from the persisted meta DB when one exists.
meta_visit = deserialize(META_PATH)
if not meta_visit:
    meta_visit = createEmptyMeta()

current_visits = createEmptyVisits()

f = open("access.log")
for l in f:
    # print "line " + l;

    groups = log_re.match(l)

    if groups:
        # A falsy result from newHit() aborts the scan.
        if not newHit(groups.groupdict()):
            break
    else:
        print "No match " + l
f.close();

if analyse_started:
    # Flush the (partial) current day and month, then persist the meta DB.
    generateDayStats()
    generateMonthStats()
    serialize(meta_visit, META_PATH)
else:
    print '==> Analyse not started : nothing to do'

Archive Download this file

Branches

Tags