iwla

iwla Commit Details

Date:2014-11-25 16:22:07 (8 years 10 months ago)
Author:Grégory Soutadé
Branch:dev, master
Commit:7405cf237acfbc4fa6d4a7d5a5e3f1d6ae05b368
Parents: d5db763b48b02f0f24e760e83368738c73b2475a
Message:Do a more generic plugin : page_to_hit

Changes:
D plugins/pre_analysis/soutade.py (full)
A plugins/pre_analysis/page_to_hit.py (full)
M conf.py (1 diff)
M iplugin.py (1 diff)
M iwla.py (6 diffs)

File differences

conf.py
1111
1212
1313
14
14
1515
1616
1717
1818
19
1920
2021
2122
DB_ROOT = './output/'
DISPLAY_ROOT = './output/'
pre_analysis_hooks = ['soutade', 'robots']
pre_analysis_hooks = ['page_to_hit', 'robots']
post_analysis_hooks = ['top_visitors', 'reverse_dns']
display_hooks = ['top_visitors']
reverse_dns_timeout = 0.2
page_to_hit_conf = [r'^.+/logo/$']
# pre_analysis_hooks = ['H002_soutade.py', 'H001_robot.py']
# post_analysis_hooks = ['top_visitors.py']
# display_hooks = ['top_visitors.py']
iplugin.py
3131
3232
3333
34
35
3436
3537
3638
def preloadPlugins(plugins, iwla):
cache_plugins = {}
print "==> Preload plugins"
for root in plugins.keys():
for plugin_filename in plugins[root]:
plugin_path = root + '.' + plugin_filename
iwla.py
11
22
3
43
54
65
7
8
96
107
118
......
126123
127124
128125
129
126
130127
131128
132129
......
160157
161158
162159
163
160
164161
165162
166163
......
347344
348345
349346
350
347
351348
352349
353350
......
370367
371368
372369
373
370
374371
375372
376373
......
378375
379376
380377
378
379
381380
382381
383382
#!/usr/bin/env python
import sys
import os
import re
import time
import glob
import imp
import pickle
import gzip
import importlib
remote_addr = hit['remote_addr']
if not remote_addr in self.current_analysis['visits'].keys():
self._createUser(hit)
self._createVisitor(hit)
return
super_hit = self.current_analysis['visits'][remote_addr]
else:
super_hit[hit_key] += 1
def _createUser(self, hit):
def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
super_hit['viewed_pages'] = 0
else:
if not self.analyse_started:
if time.mktime(cur_time) >= time.mktime(t):
return
return False
else:
self.analyse_started = True
if cur_time.tm_mon != t.tm_mon:
def start(self):
self.cache_plugins = preloadPlugins(self.plugins, self)
print '==> Analysing log'
print '==> Analyse previous database'
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:
else:
self._clearVisits()
print '==> Analysing log'
with open(conf.analyzed_filename) as f:
for l in f:
# print "line " + l
plugins/pre_analysis/page_to_hit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
from iwla import IWLA
from iplugin import IPlugin
# Count requests whose URI matches a configured regexp as hits instead of pages
class IWLAPreAnalysisPageToHit(IPlugin):
    """Pre-analysis plugin that reclassifies matching page views as hits.

    The patterns come from the ``page_to_hit_conf`` configuration value, a
    list of regular expression strings (for example ``[r'^.+/logo/$']``).
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Read and compile the configured patterns.

        Returns:
            False when ``page_to_hit_conf`` is missing or empty, True
            once the patterns have been compiled into ``self.regexps``.
        """
        self.regexps = self.iwla.getConfValue('page_to_hit_conf', [])
        if not self.regexps:
            return False

        # Build a real list: a lazy ``map`` object (Python 3) would be
        # exhausted after the first request checked in ``hook``, and the
        # ``lambda(r)`` tuple-parameter form is not valid Python 3 syntax.
        self.regexps = [re.compile(pattern) for pattern in self.regexps]
        return True

    def hook(self, iwla):
        """Turn page views whose URI matches a pattern into plain hits.

        Only requests of non-robot visits that are currently flagged as
        pages, have status 200 and share the day of month of the visit's
        ``last_access`` are considered.  For each match the request's
        ``is_page`` flag is cleared, ``viewed_pages`` is decremented and
        ``viewed_hits`` is incremented on the visit.
        """
        # NOTE(review): getCurrentVisists() presumably returns a dict of
        # visits keyed by remote address - confirm against iwla.py.
        visits = iwla.getCurrentVisists()
        for super_hit in visits.values():
            if super_hit['robot']:
                continue

            for request in super_hit['requests']:
                if not request['is_page']:
                    continue
                if int(request['status']) != 200:
                    continue
                if request['time_decoded'].tm_mday != super_hit['last_access'].tm_mday:
                    continue

                uri = request['extract_request']['extract_uri']
                # any() stops at the first matching pattern, so a request
                # is converted at most once.
                if any(regexp.match(uri) for regexp in self.regexps):
                    request['is_page'] = False
                    super_hit['viewed_pages'] -= 1
                    super_hit['viewed_hits'] += 1
plugins/pre_analysis/soutade.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
from iwla import IWLA
from iplugin import IPlugin
# Count indefero logo requests as hits instead of pages
class IWLAPreAnalysisSoutade(IPlugin):
    """Pre-analysis plugin that counts ``.../logo/`` page views as hits."""

    def __init__(self, iwla):
        super(IWLAPreAnalysisSoutade, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the logo pattern; always returns True."""
        # Remove logo from indefero
        self.logo_re = re.compile(r'^.+/logo/$')
        return True

    def hook(self, iwla):
        """Clear ``is_page`` on logo requests and adjust the visit counters.

        A request is examined only when its visit is not a robot and the
        request is flagged as a page, has status 200 and shares the day of
        month of the visit's ``last_access``.
        """
        visits = iwla.getCurrentVisists()
        for visit in visits.values():
            if visit['robot']:
                continue

            for request in visit['requests']:
                # Short-circuit order mirrors the successive guard checks:
                # page flag, then status, then day of month.
                eligible = (
                    request['is_page']
                    and int(request['status']) == 200
                    and request['time_decoded'].tm_mday == visit['last_access'].tm_mday
                )
                if not eligible:
                    continue

                uri = request['extract_request']['extract_uri']
                if self.logo_re.match(uri):
                    request['is_page'] = False
                    visit['viewed_pages'] -= 1
                    visit['viewed_hits'] += 1

Archive Download the corresponding diff file

Branches

Tags