iwla Git Source Tree

Root/iwla.py

#!/usr/bin/env python

import os
import re
import time
import glob
import imp
from robots import awstats_robots

print '==> Start'

current_visit = {}

log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
    '"$request" $status $body_bytes_sent ' +\
    '"$http_referer" "$http_user_agent"'

log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
log_format_extracted = re.sub(r'\$(\w+)', r'(?P<\g<1>>.+)', log_format_extracted)
http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
#09/Nov/2014:06:35:16 +0100
time_format = '%d/%b/%Y:%H:%M:%S +0100'
#print "Log format : " + log_format_extracted

log_re = re.compile(log_format_extracted)
uri_re = re.compile(r'(?P<extract_uri>[^\?]*)\?(?P<extract_parameters>.*)')
pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
viewed_http_codes = [200]

cur_time = None

print '==> Generating robot dictionary'

awstats_robots = map(lambda x: re.compile(x, re.IGNORECASE), awstats_robots)

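# A request counts as a page (rather than a plain hit) when its URI ends with
# one of the extensions listed in pages_extensions.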
def isPage(request):
    for e in pages_extensions:
        if request.endswith(e):
            return True

    return False

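# Attach a parsed log entry to the visitor identified by its remote address:
# record the hit, add its size to the bandwidth counter and update the
# viewed/not-viewed page and hit counters (redirects are not counted).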
def appendHit(hit):
    super_hit = current_visit[hit['remote_addr']]
    super_hit['pages'].append(hit)
    super_hit['bandwith'] += int(hit['body_bytes_sent'])

    request = hit['extract_request']

    if 'extract_uri' in request.keys():
        uri = request['extract_uri']
    else:
        uri = request['http_uri']

    hit['is_page'] = isPage(uri)

    # Don't count redirect status
    if int(hit['status']) == 302: return

    if super_hit['robot'] or\
       not int(hit['status']) in viewed_http_codes:
        page_key = 'not_viewed_pages'
        hit_key = 'not_viewed_hits'
    else:
        page_key = 'viewed_pages'
        hit_key = 'viewed_hits'

    if hit['is_page']:
        super_hit[page_key] += 1
    else:
        super_hit[hit_key] += 1

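# Create an empty statistics entry in current_visit for this remote address.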
def createGeneric(hit):
    super_hit = current_visit[hit['remote_addr']] = {}
    super_hit['viewed_pages'] = 0
    super_hit['viewed_hits'] = 0
    super_hit['not_viewed_pages'] = 0
    super_hit['not_viewed_hits'] = 0
    super_hit['bandwith'] = 0
    super_hit['pages'] = []

    return super_hit

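# Register a new visitor, flag it as robot or human and record its first hit.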
def createUser(hit, robot):
    super_hit = createGeneric(hit)
    super_hit['robot'] = robot
    appendHit(hit)

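# A visitor is flagged as a robot when its user agent matches one of the
# awstats robot patterns.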
def isRobot(hit):
    for r in awstats_robots:
        if r.match(hit['http_user_agent']):
            return True
    return False

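# Split the raw $request field into method, URI and protocol version, then
# separate the URI and the referer into path and query-string parts.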
def decode_http_request(hit):
    if not 'request' in hit.keys(): return False

    groups = http_request_extracted.match(hit['request'])

    if groups:
        hit['extract_request'] = groups.groupdict()
        uri_groups = uri_re.match(hit['extract_request']['http_uri'])
        if uri_groups:
            hit['extract_request']['extract_uri'] = uri_groups.group('extract_uri')
            hit['extract_request']['extract_parameters'] = uri_groups.group('extract_parameters')
    else:
        print "Bad request extraction " + hit['request']
        return False

    referer_groups = uri_re.match(hit['http_referer'])
    if referer_groups:
        # The sub-dictionary must exist before its keys are filled in
        hit['extract_referer'] = {}
        hit['extract_referer']['extract_uri'] = referer_groups.group('extract_uri')
        hit['extract_referer']['extract_parameters'] = referer_groups.group('extract_parameters')
    return True

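# Parse the $time_local field into a time.struct_time using time_format.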
def decode_time(hit):
    t = hit['time_local']

    hit['time_decoded'] = time.strptime(t, time_format)


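# Handle one decoded log line: blank out '-' fields, decode the timestamp,
# stop the analysis when the day changes, and attach the hit to an existing
# or newly created visitor.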
def newHit(hit):
    global cur_time

    # Skip the line (but keep parsing the log) when the request can't be decoded
    if not decode_http_request(hit): return True

    for k in hit.keys():
        if hit[k] == '-': hit[k] = ''

    decode_time(hit)

    t = hit['time_decoded']

    current_visit['last_time'] = t

    if cur_time is None:
        cur_time = t
    else:
        if cur_time.tm_mday != t.tm_mday:
            return False

    remote_addr = hit['remote_addr']
    if remote_addr in current_visit.keys():
        appendHit(hit)
    else:
        createUser(hit, isRobot(hit))

    return True

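# Match each line of access.log against the compiled log format and feed every
# hit to newHit(); analysis stops at the first line of the following day.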
print '==> Analysing log'
f = open("access.log")
for l in f:
    # print "line " + l

    groups = log_re.match(l)

    if groups:
        if not newHit(groups.groupdict()):
            break
    else:
        print "No match " + l
f.close()

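# Run the pre-processing hooks found in ./hooks_pre/ on the collected visits.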
print '==> Call plugins'
plugins = glob.glob('./hooks_pre/*.py')
plugins.sort()
for p in plugins:
    print '\t%s' % (p)
    mod = imp.load_source('hook', p)
    mod.hook(current_visit)

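# Print per-visitor counters (robots excluded), skipping the raw page list.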
for ip in current_visit.keys():
    # 'last_time' is a timestamp, not a visitor entry
    if ip == 'last_time': continue
    hit = current_visit[ip]
    if hit['robot']: continue
    print "%s =>" % (ip)
    for k in hit.keys():
        if k != 'pages':
            print "\t%s : %s" % (k, current_visit[ip][k])
