Start using classes

This commit is contained in:
Gregory Soutade 2014-11-21 14:46:12 +01:00
parent 7dada493ab
commit c3c201fda1
1 changed file with 319 additions and 326 deletions

iwla.py

@@ -10,94 +10,89 @@ import gzip
 from display import *
 
-# Default configuration
-DB_ROOT = './output/'
-DISPLAY_ROOT = './output/'
-log_format = '$server_name:$server_port $remote_addr - $remote_user [$time_local] ' +\
-    '"$request" $status $body_bytes_sent ' +\
-    '"$http_referer" "$http_user_agent"'
-time_format = '%d/%b/%Y:%H:%M:%S +0100'
-pre_analysis_hooks = []
-post_analysis_hooks = []
-display_hooks = []
+from default_conf import *
 from conf import *
 
-print '==> Start'
-
-meta_visit = {}
-analyse_started = False
-current_visits = {}
-cache_plugins = {}
-display = {}
-
-log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
-log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', log_format_extracted)
-http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
-log_re = re.compile(log_format_extracted)
-uri_re = re.compile(r'(?P<extract_uri>[^\?]*)[\?(?P<extract_parameters>.*)]?')
-pages_extensions = ['/', 'html', 'xhtml', 'py', 'pl', 'rb', 'php']
-viewed_http_codes = [200]
-
-HOOKS_ROOT = './plugins/'
-PRE_HOOK_DIRECTORY = HOOKS_ROOT + 'pre_analysis/'
-POST_HOOK_DIRECTORY = HOOKS_ROOT + 'post_analysis/'
-DISPLAY_HOOK_DIRECTORY = HOOKS_ROOT + 'display/'
-META_PATH = DB_ROOT + 'meta.db'
-DB_FILENAME = 'iwla.db'
-
-plugins = {PRE_HOOK_DIRECTORY : pre_analysis_hooks, POST_HOOK_DIRECTORY : post_analysis_hooks, DISPLAY_HOOK_DIRECTORY : display_hooks}
-
-ANALYSIS_CLASS = 'HTTP'
-API_VERSION = 1
-
-def preloadPlugins():
-    ret = True
-    for root in plugins.keys():
-        for plugin_name in plugins[root]:
-            p = root + '/' + plugin_name
-            try:
-                fp, pathname, description = imp.find_module(plugin_name, [root])
-                cache_plugins[p] = imp.load_module(plugin_name, fp, pathname, description)
-                #cache_plugins[p] = imp.load_module(p,None,p,("py","r",imp.PKG_DIRECTORY))
-                #cache_plugins[p] = imp.load_source(p, p)
-                mod = cache_plugins[p]
-                #print dir(mod)
-                #print "Register %s -> %s" % (p, mod)
-                infos = mod.get_plugins_infos()
-                if infos['class'] != ANALYSIS_CLASS or \
-                   API_VERSION < infos['min_version'] or\
-                   (infos['max_version'] != -1 and (API_VERSION > infos['max_version'])):
-                    del cache_plugins[p]
-                elif not mod.load():
-                    del cache_plugins[p]
-            except Exception as e:
-                print 'Error loading \'%s\' => %s' % (p, e)
-                ret = False
-    return ret
-
-def createEmptyVisits():
-    visits = {'days_stats' : {}, 'month_stats' : {}, 'visits' : {}}
-    return visits
-
-def createEmptyMeta():
-    meta = {'last_time' : None}
-    return meta
-
-def createEmptyDisplay():
-    display = {}
-    return display
-
-def getDBFilename(time):
-    return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME)
-
-def serialize(obj, filename):
-    base = os.path.dirname(filename)
-    if not os.path.exists(base):
-        os.makedirs(base)
+class IWLA(object):
+
+    ANALYSIS_CLASS = 'HTTP'
+    API_VERSION = 1
+
+    def __init__(self):
+        print '==> Start'
+        self.meta_infos = {}
+        self.analyse_started = False
+        self.current_analysis = {}
+        self.cache_plugins = {}
+        self.display = {}
+        self.valid_visitors = None
+        self.log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
+        self.log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', self.log_format_extracted)
+        self.http_request_extracted = re.compile(r'(?P<http_method>\S+) (?P<http_uri>\S+) (?P<http_version>\S+)')
+        self.log_re = re.compile(self.log_format_extracted)
+        self.uri_re = re.compile(r'(?P<extract_uri>[^\?]*)[\?(?P<extract_parameters>.*)]?')
+        self.plugins = {PRE_HOOK_DIRECTORY : pre_analysis_hooks,
+                        POST_HOOK_DIRECTORY : post_analysis_hooks,
+                        DISPLAY_HOOK_DIRECTORY : display_hooks}
+
+    def _preloadPlugins(self):
+        ret = True
+        for root in self.plugins.keys():
+            for plugin_name in self.plugins[root]:
+                p = root + '/' + plugin_name
+                try:
+                    fp, pathname, description = imp.find_module(plugin_name, [root])
+                    self.cache_plugins[p] = imp.load_module(plugin_name, fp, pathname, description)
+                    mod = self.cache_plugins[p]
+                    infos = mod.get_plugins_infos()
+                    if infos['class'] != IWLA.ANALYSIS_CLASS or \
+                       IWLA.API_VERSION < infos['min_version'] or\
+                       (infos['max_version'] != -1 and (IWLA.API_VERSION > infos['max_version'])):
+                        del self.cache_plugins[p]
+                    elif not mod.load():
+                        del self.cache_plugins[p]
+                except Exception as e:
+                    print 'Error loading \'%s\' => %s' % (p, e)
+                    ret = False
+        return ret
+
+    def _clearVisits(self):
+        self.current_analysis = {
+            'days_stats' : {},
+            'month_stats' : {},
+            'visits' : {}
+        }
+        self.valid_visitors = None
+        return self.current_analysis
+
+    def getDaysStats(self):
+        return self.current_analysis['days_stats']
+
+    def getMonthStatsStats(self):
+        return self.current_analysis['month_stats']
+
+    def getCurrentVisists(self):
+        return self.current_analysis['visits']
+
+    def getValidVisitors(self):
+        return self.current_analysis['visits']
+
+    def _clearMeta(self):
+        self.meta_infos = {
+            'last_time' : None
+        }
+        return self.meta_infos
+
+    def _clearDisplay(self):
+        self.display = {}
+        return self.display
+
+    def getDBFilename(self, time):
+        return (DB_ROOT + '%d/%d_%s') % (time.tm_year, time.tm_mon, DB_FILENAME)
+
+    def _serialize(self, obj, filename):
+        base = os.path.dirname(filename)
+        if not os.path.exists(base):
+            os.makedirs(base)
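
The two re.sub() calls in __init__() above turn the $name placeholders of log_format into named capture groups; log_re then splits each log line into a dictionary of fields. A minimal sketch of that transformation (Python 2, like the rest of the file), using a shortened, made-up log format and log line rather than iwla's real nginx-style default:

    import re

    # Illustrative format only; iwla's real format is the nginx-style one removed above
    log_format = '$remote_addr [$time_local] "$request" $status $body_bytes_sent'

    # Same two substitutions as IWLA.__init__(): escape literal characters,
    # then turn each $name into a named group
    log_format_extracted = re.sub(r'([^\$\w])', r'\\\g<1>', log_format)
    log_format_extracted = re.sub(r'\$(\w+)', '(?P<\g<1>>.+)', log_format_extracted)
    log_re = re.compile(log_format_extracted)

    line = '127.0.0.1 [21/Nov/2014:14:46:12 +0100] "GET /index.html HTTP/1.1" 200 1234'
    hit = log_re.match(line).groupdict()
    print hit['status']    # 200
    print hit['request']   # GET /index.html HTTP/1.1
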
@@ -112,7 +107,7 @@ def serialize(obj, filename):
             fzip.write(f.read())
         os.remove(filename + '.tmp')
 
-def deserialize(filename):
+    def _deserialize(self, filename):
         if not os.path.exists(filename):
             return None
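
Only fragments of _serialize() and _deserialize() appear in these hunks. From the visible pieces (the gzip import, pickle.load(), the '.tmp' file), they appear to implement a gzip-compressed pickle written through a temporary file. A rough sketch of that round-trip, under that assumption and not the project's exact code:

    import gzip
    import os
    import pickle

    def serialize(obj, filename):
        base = os.path.dirname(filename)
        if not os.path.exists(base):
            os.makedirs(base)
        # Dump to a temporary file, gzip it into place, drop the temp file
        with open(filename + '.tmp', 'wb') as f:
            pickle.dump(obj, f)
        with open(filename + '.tmp', 'rb') as f:
            with gzip.open(filename, 'wb') as fzip:
                fzip.write(f.read())
        os.remove(filename + '.tmp')

    def deserialize(filename):
        # A missing file simply means there is no previous analysis
        if not os.path.exists(filename):
            return None
        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)
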
@@ -120,31 +115,31 @@ def deserialize(filename):
             return pickle.load(f)
         return None
 
-def callPlugins(root, *args):
+    def _callPlugins(self, root, *args):
         print '==> Call plugins (%s)' % root
-    for p in plugins[root]:
+        for p in self.plugins[root]:
             print '\t%s' % (p)
-        mod = cache_plugins[root + '/' + p]
+            mod = self.cache_plugins[root + '/' + p]
             mod.hook(*args)
 
-def isPage(request):
+    def isPage(self, request):
         for e in pages_extensions:
             if request.endswith(e):
                 return True
         return False
 
-def appendHit(hit):
+    def _appendHit(self, hit):
         remote_addr = hit['remote_addr']
 
-    if not remote_addr in current_visits['visits'].keys():
-        createUser(hit)
+        if not remote_addr in self.current_analysis['visits'].keys():
+            self._createUser(hit)
             return
 
-    super_hit = current_visits['visits'][remote_addr]
+        super_hit = self.current_analysis['visits'][remote_addr]
         super_hit['requests'].append(hit)
         super_hit['bandwidth'] += int(hit['body_bytes_sent'])
-    super_hit['last_access'] = meta_visit['last_time']
+        super_hit['last_access'] = self.meta_infos['last_time']
 
         request = hit['extract_request']
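
_preloadPlugins() and _callPlugins() above define the whole plugin contract: a module found under one of the hook directories must expose get_plugins_infos(), load() and hook(). A minimal sketch of a pre-analysis plugin that would pass those checks; the file name and the robot heuristic are invented for illustration:

    # Hypothetical plugins/pre_analysis/example_robots.py

    def get_plugins_infos():
        # Must match IWLA.ANALYSIS_CLASS and accept API_VERSION 1
        # (max_version == -1 means "no upper bound")
        return {'class' : 'HTTP', 'min_version' : 1, 'max_version' : -1}

    def load():
        # Return True to stay registered in cache_plugins
        return True

    def hook(visits):
        # Pre-analysis hooks receive the visits dictionary built by _appendHit()
        for super_hit in visits.values():
            requests = super_hit['requests']
            if requests and 'bot' in requests[0].get('http_user_agent', '').lower():
                super_hit['robot'] = True
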
@@ -153,7 +148,7 @@ def appendHit(hit):
         else:
             uri = request['http_uri']
 
-    hit['is_page'] = isPage(uri)
+        hit['is_page'] = self.isPage(uri)
 
         # Don't count 3xx status
         status = int(hit['status'])
@@ -172,28 +167,28 @@ def appendHit(hit):
         else:
             super_hit[hit_key] += 1
 
-def createUser(hit):
-    super_hit = current_visits['visits'][hit['remote_addr']] = {}
+    def _createUser(self, hit):
+        super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
         super_hit['remote_addr'] = hit['remote_addr']
         super_hit['viewed_pages'] = 0
         super_hit['viewed_hits'] = 0
         super_hit['not_viewed_pages'] = 0
         super_hit['not_viewed_hits'] = 0
         super_hit['bandwidth'] = 0
-    super_hit['last_access'] = meta_visit['last_time']
+        super_hit['last_access'] = self.meta_infos['last_time']
         super_hit['requests'] = []
         super_hit['robot'] = False
         super_hit['hit_only'] = 0
-    appendHit(hit)
+        self._appendHit(hit)
 
-def decodeHTTPRequest(hit):
+    def _decodeHTTPRequest(self, hit):
         if not 'request' in hit.keys(): return False
 
-    groups = http_request_extracted.match(hit['request'])
+        groups = self.http_request_extracted.match(hit['request'])
 
         if groups:
             hit['extract_request'] = groups.groupdict()
-        uri_groups = uri_re.match(hit['extract_request']['http_uri'])
+            uri_groups = self.uri_re.match(hit['extract_request']['http_uri'])
             if uri_groups:
                 d = uri_groups.groupdict()
                 hit['extract_request']['extract_uri'] = d['extract_uri']
@@ -203,41 +198,39 @@ def decodeHTTPRequest(hit):
             print "Bad request extraction " + hit['request']
             return False
 
-    referer_groups = uri_re.match(hit['http_referer'])
+        referer_groups = self.uri_re.match(hit['http_referer'])
         if referer_groups:
             referer = hit['extract_referer'] = referer_groups.groupdict()
         return True
 
-def decodeTime(hit):
-    t = hit['time_local']
-    hit['time_decoded'] = time.strptime(t, time_format)
+    def _decodeTime(self, hit):
+        hit['time_decoded'] = time.strptime(hit['time_local'], time_format)
 
-def getDisplayIndex():
-    cur_time = meta_visit['last_time']
+    def getDisplayIndex(self):
+        cur_time = self.meta_infos['last_time']
         filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
 
-    return display.get(filename, None)
+        return self.display.get(filename, None)
 
-def generateDisplayDaysStat():
-    cur_time = meta_visit['last_time']
+    def _generateDisplayDaysStat(self):
+        cur_time = self.meta_infos['last_time']
         title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year)
         filename = '%d/index_%d.html' % (cur_time.tm_year, cur_time.tm_mon)
-    page = createPage(display, filename, title)
+        page = createPage(self.display, filename, title)
 
         days = createTable('By day', ['Day', 'Visits', 'Pages', 'Hits', 'Bandwidth', 'Robot Bandwidth'])
 
-    keys = current_visits['days_stats'].keys()
+        keys = self.current_analysis['days_stats'].keys()
         keys.sort()
         nb_visits = 0
 
         for k in keys:
-        stats = current_visits['days_stats'][k]
+            stats = self.current_analysis['days_stats'][k]
             row = [k, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
             row = map(lambda(v): str(v), row)
             appendRowToTable(days, row)
             nb_visits += stats['nb_visitors']
 
-    stats = current_visits['month_stats']
+        stats = self.current_analysis['month_stats']
         nb_days = len(keys)
         row = [0, nb_visits, stats['viewed_pages'], stats['viewed_hits'], stats['viewed_bandwidth'], stats['not_viewed_bandwidth']]
@@ -254,12 +247,12 @@ def generateDisplayDaysStat():
         appendRowToTable(days, row)
         appendBlockToPage(page, days)
 
-def generateDisplay():
-    generateDisplayDaysStat()
-    callPlugins(DISPLAY_HOOK_DIRECTORY, current_visits, display)
-    buildPages(DISPLAY_ROOT, display)
+    def _generateDisplay(self):
+        self._generateDisplayDaysStat()
+        self._callPlugins(DISPLAY_HOOK_DIRECTORY, self.current_analysis, self.display)
+        buildPages(DISPLAY_ROOT, self.display)
 
-def generateStats(visits):
+    def _generateStats(self, visits):
         stats = {}
         stats['viewed_bandwidth'] = 0
         stats['not_viewed_bandwidth'] = 0
@@ -289,51 +282,51 @@ def generateStats(visits):
         return stats
 
-def generateMonthStats():
-    display = createEmptyDisplay()
+    def _generateMonthStats(self):
+        self._clearDisplay()
 
-    visits = current_visits['visits']
-    stats = generateStats(visits)
+        visits = self.current_analysis['visits']
+        stats = self._generateStats(visits)
 
-    cur_time = meta_visit['last_time']
+        cur_time = self.meta_infos['last_time']
         print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon)
         print stats
 
-    valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
-    callPlugins(POST_HOOK_DIRECTORY, valid_visitors, stats)
+        self.valid_visitors = {k: v for (k,v) in visits.items() if not visits[k]['robot']}
+        self._callPlugins(POST_HOOK_DIRECTORY, valid_visitors, stats)
 
-    current_visits['month_stats'] = stats
+        self.current_analysis['month_stats'] = stats
 
-    path = getDBFilename(cur_time)
+        path = self.getDBFilename(cur_time)
         if os.path.exists(path):
             os.remove(path)
 
         print "==> Serialize to %s" % path
 
-    serialize(current_visits, path)
-    generateDisplay()
+        self._serialize(self.current_analysis, path)
+        self._generateDisplay()
 
-def generateDayStats():
-    visits = current_visits['visits']
+    def _generateDayStats(self):
+        visits = self.current_analysis['visits']
 
-    callPlugins(PRE_HOOK_DIRECTORY, visits)
+        self._callPlugins(PRE_HOOK_DIRECTORY, visits)
 
-    stats = generateStats(visits)
+        stats = self._generateStats(visits)
 
-    cur_time = meta_visit['last_time']
+        cur_time = self.meta_infos['last_time']
         print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
 
         if cur_time.tm_mday > 1:
             last_day = cur_time.tm_mday - 1
             while last_day:
-            if last_day in current_visits['days_stats'].keys():
+                if last_day in self.current_analysis['days_stats'].keys():
                     break
                 last_day -= 1
             if last_day:
                 for k in stats.keys():
-                stats[k] -= current_visits['days_stats'][last_day][k]
+                    stats[k] -= self.current_analysis['days_stats'][last_day][k]
 
         stats['nb_visitors'] = 0
         for k in visits.keys():
             if visits[k]['robot']: continue
@@ -341,71 +334,71 @@ def generateDayStats():
             stats['nb_visitors'] += 1
 
         print stats
-    current_visits['days_stats'][cur_time.tm_mday] = stats
+        self.current_analysis['days_stats'][cur_time.tm_mday] = stats
 
-def newHit(hit):
-    global current_visits
-    global analyse_started
-
-    decodeTime(hit)
+    def _newHit(self, hit):
+        self._decodeTime(hit)
 
         t = hit['time_decoded']
 
-    cur_time = meta_visit['last_time']
+        cur_time = self.meta_infos['last_time']
 
         if cur_time == None:
-        current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
-        analyse_started = True
+            self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
+            self.analyse_started = True
         else:
-        if not analyse_started:
+            if not self.analyse_started:
                 if time.mktime(cur_time) >= time.mktime(t):
                     return
                 else:
-                analyse_started = True
+                    self.analyse_started = True
             if cur_time.tm_mon != t.tm_mon:
-            generateMonthStats()
-            current_visits = deserialize(getDBFilename(t)) or createEmptyVisits()
+                self._generateMonthStats()
+                self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
             elif cur_time.tm_mday != t.tm_mday:
-            generateDayStats()
+                self._generateDayStats()
 
-    meta_visit['last_time'] = t
+        self.meta_infos['last_time'] = t
 
-    if not decodeHTTPRequest(hit): return False
+        if not self._decodeHTTPRequest(hit): return False
 
         for k in hit.keys():
             if hit[k] == '-': hit[k] = ''
 
-    appendHit(hit)
+        self._appendHit(hit)
 
         return True
 
-preloadPlugins()
-
-print '==> Analysing log'
-
-meta_visit = deserialize(META_PATH) or createEmptyMeta()
-if meta_visit['last_time']:
-    current_visits = deserialize(getDBFilename(meta_visit['last_time'])) or createEmptyVisits()
-else:
-    current_visits = createEmptyVisits()
-
-f = open(analyzed_filename)
-for l in f:
-    # print "line " + l
-
-    groups = log_re.match(l)
-    if groups:
-        if not newHit(groups.groupdict()):
-            break
-    else:
-        print "No match " + l
-f.close()
-
-if analyse_started:
-    generateDayStats()
-    generateMonthStats()
-    serialize(meta_visit, META_PATH)
-else:
-    print '==> Analyse not started : nothing to do'
-    generateMonthStats()
+    def start(self):
+        self._preloadPlugins()
+
+        print '==> Analysing log'
+
+        self.meta_infos = self._deserialize(META_PATH) or self._clearMeta()
+        if self.meta_infos['last_time']:
+            self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
+        else:
+            self._clearVisits()
+
+        with open(analyzed_filename) as f:
+            for l in f:
+                # print "line " + l
+
+                groups = self.log_re.match(l)
+                if groups:
+                    if not self._newHit(groups.groupdict()):
+                        break
+                else:
+                    print "No match for " + l
+
+        if self.analyse_started:
+            self._generateDayStats()
+            self._generateMonthStats()
+            self._serialize(meta_infos, META_PATH)
+        else:
+            print '==> Analyse not started : nothing to do'
+            self._generateMonthStats()
+
+iwla = IWLA()
+iwla.start()
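
_newHit() rolls statistics over by comparing the previous hit's struct_time with the current one: a new month triggers _generateMonthStats() and a reload of the month database, while a new day only triggers _generateDayStats(). A small standalone illustration of that test, with two made-up timestamps in the same time_format:

    import time

    time_format = '%d/%b/%Y:%H:%M:%S +0100'

    cur_time = time.strptime('30/Nov/2014:23:59:59 +0100', time_format)
    t = time.strptime('01/Dec/2014:00:00:05 +0100', time_format)

    if cur_time.tm_mon != t.tm_mon:
        print 'month changed: generate month stats, then reload the month database'
    elif cur_time.tm_mday != t.tm_mday:
        print 'day changed: generate day stats only'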