iwla

iwla Git Source Tree

Root/plugins/pre_analysis/page_to_hit.py

Source at commit 4e02325733e5e8e4f5de2f0046e721f8da7abfff created 6 years 10 months ago.
By Gregory Soutade, Initial commit
1# -*- coding: utf-8 -*-
2#
3# Copyright Grégory Soutadé 2015
4
5# This file is part of iwla
6
7# iwla is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# iwla is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with iwla. If not, see <http://www.gnu.org/licenses/>.
19#
20
21import re
22
23from iwla import IWLA
24from iplugin import IPlugin
25
26"""
27Pre analysis hook
28Change pages into hits and hits into pages in statistics
29
30Plugin requirements :
31 None
32
33Conf values needed :
34 page_to_hit_conf*
35 hit_to_page_conf*
36
37Output files :
38 None
39
40Statistics creation :
41 None
42
43Statistics update :
44visits :
45 remote_addr =>
46 is_page
47
48Statistics deletion :
49 None
50"""
51
class IWLAPreAnalysisPageToHit(IPlugin):
    """Pre-analysis plugin that reclassifies requests between pages and hits.

    Two lists of regular expressions are read from the configuration:

    * ``page_to_hit_conf``: a viewed request flagged as a page whose URI
      matches one of these patterns is turned into a hit.
    * ``hit_to_page_conf``: a viewed request flagged as a hit whose URI
      matches one of these patterns is turned into a page.

    The ``viewed_pages`` / ``viewed_hits`` counters of the owning visit are
    adjusted accordingly.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Read and compile both pattern lists from the configuration.

        Returns:
            False as soon as one of the two configuration values is empty
            or missing, True once both lists have been compiled.
        """
        # Page to hit
        self.ph_regexps = self.iwla.getConfValue('page_to_hit_conf', [])
        if not self.ph_regexps:
            return False
        # A list (not map()) so the patterns can be iterated once per request
        # under Python 3 as well, where map() yields a one-shot iterator.
        self.ph_regexps = [re.compile(r) for r in self.ph_regexps]

        # Hit to page
        self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
        if not self.hp_regexps:
            return False
        self.hp_regexps = [re.compile(r) for r in self.hp_regexps]

        return True

    def hook(self):
        """Apply the page/hit reclassification to every non-robot visit.

        Each request is modified in place (``is_page``) and the counters of
        its visit (``viewed_pages``, ``viewed_hits``) are updated to match.
        """
        visits = self.iwla.getCurrentVisists()

        for visit in visits.values():
            if visit['robot']:
                continue

            # Requests are walked from the last one backwards and the walk
            # stops at the first one that is not valid for the current
            # analysis (presumably everything before it belongs to an earlier
            # analysis -- TODO confirm against IWLA).
            for request in reversed(visit['requests']):
                if not self.iwla.isValidForCurrentAnalysis(request):
                    break

                if not self.iwla.hasBeenViewed(request):
                    continue

                uri = request['extract_request']['extract_uri']

                if request['is_page']:
                    # Page to hit
                    if any(regexp.match(uri) for regexp in self.ph_regexps):
                        request['is_page'] = False
                        visit['viewed_pages'] -= 1
                        visit['viewed_hits'] += 1
                else:
                    # Hit to page
                    if any(regexp.match(uri) for regexp in self.hp_regexps):
                        request['is_page'] = True
                        visit['viewed_pages'] += 1
                        visit['viewed_hits'] -= 1

Archive Download this file

Branches

Tags