Make a more generic plugin: page_to_hit

This commit is contained in:
Grégory Soutadé 2014-11-25 16:22:07 +01:00
parent d5db763b48
commit 7405cf237a
5 changed files with 48 additions and 43 deletions

View File

@ -11,11 +11,12 @@ analyzed_filename = 'access.log'
DB_ROOT = './output/' DB_ROOT = './output/'
DISPLAY_ROOT = './output/' DISPLAY_ROOT = './output/'
pre_analysis_hooks = ['soutade', 'robots'] pre_analysis_hooks = ['page_to_hit', 'robots']
post_analysis_hooks = ['top_visitors', 'reverse_dns'] post_analysis_hooks = ['top_visitors', 'reverse_dns']
display_hooks = ['top_visitors'] display_hooks = ['top_visitors']
reverse_dns_timeout = 0.2 reverse_dns_timeout = 0.2
page_to_hit_conf = [r'^.+/logo/$']
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py'] # pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
# post_analysis_hooks = ['top_visitors.py'] # post_analysis_hooks = ['top_visitors.py']
# display_hooks = ['top_visitors.py'] # display_hooks = ['top_visitors.py']

View File

@ -31,6 +31,8 @@ class IPlugin(object):
def preloadPlugins(plugins, iwla): def preloadPlugins(plugins, iwla):
cache_plugins = {} cache_plugins = {}
print "==> Preload plugins"
for root in plugins.keys(): for root in plugins.keys():
for plugin_filename in plugins[root]: for plugin_filename in plugins[root]:
plugin_path = root + '.' + plugin_filename plugin_path = root + '.' + plugin_filename

13
iwla.py
View File

@ -1,11 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys
import os import os
import re import re
import time import time
import glob
import imp
import pickle import pickle
import gzip import gzip
import importlib import importlib
@ -126,7 +123,7 @@ class IWLA(object):
remote_addr = hit['remote_addr'] remote_addr = hit['remote_addr']
if not remote_addr in self.current_analysis['visits'].keys(): if not remote_addr in self.current_analysis['visits'].keys():
self._createUser(hit) self._createVisitor(hit)
return return
super_hit = self.current_analysis['visits'][remote_addr] super_hit = self.current_analysis['visits'][remote_addr]
@ -160,7 +157,7 @@ class IWLA(object):
else: else:
super_hit[hit_key] += 1 super_hit[hit_key] += 1
def _createUser(self, hit): def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {} super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr'] super_hit['remote_addr'] = hit['remote_addr']
super_hit['viewed_pages'] = 0 super_hit['viewed_pages'] = 0
@ -347,7 +344,7 @@ class IWLA(object):
else: else:
if not self.analyse_started: if not self.analyse_started:
if time.mktime(cur_time) >= time.mktime(t): if time.mktime(cur_time) >= time.mktime(t):
return return False
else: else:
self.analyse_started = True self.analyse_started = True
if cur_time.tm_mon != t.tm_mon: if cur_time.tm_mon != t.tm_mon:
@ -370,7 +367,7 @@ class IWLA(object):
def start(self): def start(self):
self.cache_plugins = preloadPlugins(self.plugins, self) self.cache_plugins = preloadPlugins(self.plugins, self)
print '==> Analysing log' print '==> Analyse previous database'
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta() self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']: if self.meta_infos['last_time']:
@ -378,6 +375,8 @@ class IWLA(object):
else: else:
self._clearVisits() self._clearVisits()
print '==> Analysing log'
with open(conf.analyzed_filename) as f: with open(conf.analyzed_filename) as f:
for l in f: for l in f:
# print "line " + l # print "line " + l

View File

@ -0,0 +1,38 @@
import re
from iwla import IWLA
from iplugin import IPlugin
# Pre-analysis plugin: reclassify pages matching configured regexps as hits
class IWLAPreAnalysisPageToHit(IPlugin):
    """Count requests whose URI matches a configured pattern as hits, not pages.

    The patterns are read from the ``page_to_hit_conf`` configuration value,
    a list of regular-expression strings.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the configured patterns.

        Returns False when ``page_to_hit_conf`` is empty or unset, True
        otherwise.
        """
        # NOTE(review): self.iwla is presumably set by IPlugin.__init__ -- confirm.
        patterns = self.iwla.getConfValue('page_to_hit_conf', [])
        # Store the raw value first, as before, so self.regexps always exists.
        self.regexps = patterns
        if not patterns:
            return False
        # Build a real list: hook() walks the patterns once per request, and
        # a lazy map object (Python 3) would be exhausted after the first pass.
        self.regexps = [re.compile(pattern) for pattern in patterns]
        return True

    def hook(self, iwla):
        """Turn matching page requests of non-robot visitors into hits.

        For each reclassified request, ``is_page`` is cleared and the
        visitor's ``viewed_pages`` / ``viewed_hits`` counters are adjusted.
        """
        # 'getCurrentVisists' (sic) is the accessor name used by this file.
        visits = iwla.getCurrentVisists()
        for super_hit in visits.values():
            if super_hit['robot']:
                continue
            for request in super_hit['requests']:
                if not request['is_page']:
                    continue
                if int(request['status']) != 200:
                    continue
                # Only requests made on the same day of month as the
                # visitor's last access are considered.
                if request['time_decoded'].tm_mday != super_hit['last_access'].tm_mday:
                    continue
                uri = request['extract_request']['extract_uri']
                # any() stops at the first matching pattern, so a request is
                # reclassified at most once.
                if any(regexp.match(uri) for regexp in self.regexps):
                    request['is_page'] = False
                    super_hit['viewed_pages'] -= 1
                    super_hit['viewed_hits'] += 1

View File

@ -1,35 +0,0 @@
import re
from iwla import IWLA
from iplugin import IPlugin
# Pre-analysis plugin: count Indefero logo requests as hits instead of pages
class IWLAPreAnalysisSoutade(IPlugin):
    """Reclassify requests whose URI ends in ``/logo/`` from page to hit."""

    def __init__(self, iwla):
        super(IWLAPreAnalysisSoutade, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the logo pattern; always returns True."""
        self.logo_re = re.compile(r'^.+/logo/$')
        return True

    def hook(self, iwla):
        """Clear ``is_page`` on matching requests and fix the visitor counters."""
        visits = iwla.getCurrentVisists()
        for visitor in visits.values():
            if visitor['robot']:
                continue
            last_day = visitor['last_access'].tm_mday
            for request in visitor['requests']:
                # Same three guards as separate 'continue' checks, evaluated
                # in the same order thanks to short-circuiting.
                candidate = (
                    request['is_page']
                    and int(request['status']) == 200
                    and request['time_decoded'].tm_mday == last_day
                )
                if not candidate:
                    continue
                uri = request['extract_request']['extract_uri']
                if self.logo_re.match(uri):
                    request['is_page'] = False
                    visitor['viewed_pages'] -= 1
                    visitor['viewed_hits'] += 1