iwla

iwla Git Source Tree

Root/plugins/pre_analysis/page_to_hit.py

1# -*- coding: utf-8 -*-
2#
3# Copyright Grégory Soutadé 2015
4
5# This file is part of iwla
6
7# iwla is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# iwla is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with iwla. If not, see <http://www.gnu.org/licenses/>.
19#
20
21import re
22import logging
23
24from iwla import IWLA
25from iplugin import IPlugin
26
27"""
28Pre analysis hook
29Reclassify pages as hits and hits as pages in the statistics
30
31Plugin requirements :
32 None
33
34Conf values needed :
35 page_to_hit_conf*
36 hit_to_page_conf*
37
38Output files :
39 None
40
41Statistics creation :
42 None
43
44Statistics update :
45visits :
46 remote_addr =>
47 is_page
48
49Statistics deletion :
50 None
51"""
52
class IWLAPreAnalysisPageToHit(IPlugin):
    """Pre-analysis plugin that reclassifies viewed requests.

    A viewed request flagged as a page whose URI matches one of the
    ``page_to_hit_conf`` regular expressions is turned into a hit; a viewed
    request flagged as a hit whose URI matches one of the
    ``hit_to_page_conf`` regular expressions is turned into a page.  The
    per-visit ``viewed_pages`` / ``viewed_hits`` counters are adjusted
    accordingly.
    """

    def __init__(self, iwla):
        super(IWLAPreAnalysisPageToHit, self).__init__(iwla)
        self.API_VERSION = 1

    def load(self):
        """Compile the configured patterns and set up the plugin logger.

        Both configuration values default to an empty list when unset.

        Returns:
            True, unconditionally.
        """
        # Build real lists rather than using map(): on Python 3 map() returns
        # a one-shot iterator, which would be exhausted after the first
        # request examined in hook(), silently disabling every later match.

        # Page to hit
        self.ph_regexps = [
            re.compile(r)
            for r in self.iwla.getConfValue('page_to_hit_conf', [])
        ]

        # Hit to page
        self.hp_regexps = [
            re.compile(r)
            for r in self.iwla.getConfValue('hit_to_page_conf', [])
        ]

        self.logger = logging.getLogger(self.__class__.__name__)
        return True

    @staticmethod
    def _matches(regexps, uri):
        """Return True if any compiled pattern matches at the start of uri."""
        return any(regexp.match(uri) for regexp in regexps)

    @staticmethod
    def _move_view(super_hit, day, src, dst):
        """Move one viewed request from counter ``src`` to counter ``dst``.

        ``src`` and ``dst`` are the ``'viewed_pages'`` / ``'viewed_hits'``
        keys of the visit.  The day entry of ``dst`` is created if missing.
        """
        src_counters = super_hit[src]
        dst_counters = super_hit[dst]
        src_counters[day] -= 1
        dst_counters[day] = dst_counters.get(day, 0) + 1
        # NOTE(review): index 0 presumably holds the total over the whole
        # analysed period -- confirm against the code that fills it.
        src_counters[0] -= 1
        dst_counters[0] += 1

    def hook(self):
        """Reclassify the viewed requests of every non-robot current visit."""
        for super_hit in self.iwla.getCurrentVisits().values():
            if super_hit['robot']:
                continue

            # Requests are walked from last to first; the scan of a visit
            # stops at the first request that is not valid for the current
            # analysis.
            for request in reversed(super_hit['requests']):
                if not self.iwla.isValidForCurrentAnalysis(request):
                    break

                if not self.iwla.hasBeenViewed(request):
                    continue

                uri = request['extract_request']['extract_uri']
                day = request['time_decoded'].tm_mday

                if request['is_page']:
                    # Page to hit
                    if self._matches(self.ph_regexps, uri):
                        self.logger.debug('%s changed from page to hit', uri)
                        request['is_page'] = False
                        self._move_view(super_hit, day,
                                        'viewed_pages', 'viewed_hits')
                elif self._matches(self.hp_regexps, uri):
                    # Hit to page
                    self.logger.debug('%s changed from hit to page', uri)
                    request['is_page'] = True
                    self._move_view(super_hit, day,
                                    'viewed_hits', 'viewed_pages')

Archive Download this file

Branches

Tags