iwla Commit Details

Date: 2019-08-30 07:50:54
Author: Grégory Soutadé
Branches: dev, master
Commit: bb268114b2123210e70221e8e7a4198bf5e285fd
Parent: ed6ed68706172ca69c7d989c131240a70dd3b8f6
Message:

Make backup before compressing (for low-memory servers)
Fix error: call post hook plugins even in display-only mode
Don't compute unordered hits (reject past hits found after the current one)
Remove HTML tags in stats diff
Don't do geolocation if the visitor is not valid
Don't try to find search engines for robots
Update robot check rules
Add top_pages_diff plugin
Changes:
C plugins/display/referers_diff.py → plugins/display/top_pages_diff.py
M iwla.py (3 diffs)
M plugins/display/istats_diff.py (4 diffs)
M plugins/post_analysis/ip_to_geo.py (1 diff)
M plugins/post_analysis/referers.py (1 diff)
M plugins/pre_analysis/robots.py (3 diffs)

File differences

iwla.py
@@ -252,12 +252,18 @@
         if not os.path.exists(base):
             os.makedirs(base)

+        # Make a backup in case of something fails
+        if os.path.exists(filename):
+            shutil.copy(filename, filename + '.bak')
+
         with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
             cPickle.dump(obj, f)
             os.fsync(f)
             f.seek(0)
             fzip.write(f.read())
             os.fsync(fzip)
+
+        os.remove(filename + '.tmp')
+
+        if os.path.exists(filename + '.bak'):
+            os.remove(filename + '.bak')

     def _deserialize(self, filename):
         if not os.path.exists(filename):

@@ -626,15 +632,13 @@
         duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())

+        self._callPlugins(conf.POST_HOOK_DIRECTORY)
+
         if args.display_only:
             self._generateDisplay()
             return

-        self._callPlugins(conf.POST_HOOK_DIRECTORY)
-
         path = self.getDBFilename(cur_time)
-        if os.path.exists(path) and not self.dry_run:
-            os.remove(path)

         self.logger.info("==> Serialize to %s" % (path))
         self._serialize(self.current_analysis, path)

@@ -701,6 +705,8 @@
             self.logger.debug("Not in time")
             return False

         self.analyse_started = True
+        if t < cur_time: # Don't accept past hits
+            return False
         if cur_time.tm_mon != t.tm_mon:
             self._generateDayStats()
             self._generateMonthStats()
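
For context, the serialization change works as follows: the object is pickled to a plain temporary file, fsync'd, then streamed into the compressed database, and a .bak copy of the previous database is kept until the new file is fully written, so a crash mid-write cannot lose the last good database. Below is a minimal standalone sketch of the same pattern using pickle and gzip directly; save_db and the file names are illustrative, not iwla's API (iwla wraps the compression step in _openDB()):

    # Sketch of the backup-then-compress write pattern (illustrative names).
    import gzip
    import os
    import pickle
    import shutil

    def save_db(obj, filename):
        # Keep the previous database so a failure mid-write cannot lose it.
        if os.path.exists(filename):
            shutil.copy(filename, filename + '.bak')

        tmp = filename + '.tmp'
        with open(tmp, 'wb+') as f, gzip.open(filename, 'wb') as fzip:
            # Serialize to a plain temporary file first...
            pickle.dump(obj, f)
            f.flush()
            os.fsync(f.fileno())
            # ...then stream it into the compressed database.
            f.seek(0)
            shutil.copyfileobj(f, fzip)

        os.remove(tmp)
        # Only drop the backup once the new file is fully written.
        if os.path.exists(filename + '.bak'):
            os.remove(filename + '.bak')

    save_db({'hits': 42}, 'stats.db')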
plugins/display/istats_diff.py
@@ -22,6 +22,7 @@
 from iplugin import IPlugin
 from display import *
 import logging
+import re

 """
 Display hook interface

@@ -54,9 +55,11 @@
         self.month_stats_key = None
         # Set >= if month_stats[self.month_stats_key] is a list or a tuple
         self.stats_index = -1
+        self.display_index = 1
         self.filename = None
         self.block_name = None
         self.logger = logging.getLogger(__name__)
+        self.tag_re = re.compile(r'<[^>]+>')

     def load(self):
         if not self.month_stats_key or not self.filename or\

@@ -67,6 +70,10 @@
             self.cur_stats = {k:v for (k,v) in month_stats.get(self.month_stats_key, {}).items()}

         return True

+    # from https://tutorialedge.net/python/removing-html-from-string/
+    def remove_tags(self, text):
+        return self.tag_re.sub('', text)
+
     def hook(self):
         display = self.iwla.getDisplay()
         month_stats = self.iwla.getMonthStats()

@@ -88,14 +95,21 @@
             if new_value:
                 if self.stats_index != -1:
                     if new_value[self.stats_index] != v[self.stats_index]:
-                        stats_diff[k] = 'iwla_update'
+                        diff_value = v[self.stats_index] - new_value[self.stats_index]
+                        stats_diff[k] = ['iwla_update', diff_value]
                 else:
                     if new_value != v:
-                        stats_diff[k] = 'iwla_update'
+                        diff_value = v - new_value
+                        stats_diff[k] = ['iwla_update', diff_value]
             else:
-                stats_diff[k] = 'iwla_new'
+                stats_diff[k] = ['iwla_new', 0]

         for (idx, row) in enumerate(block.rows):
-            for k in stats_diff.keys():
-                if k in row[0]:
-                    block.setCellCSSClass(idx, 0, stats_diff[k])
+            clear_text = self.remove_tags(row[0])
+            if clear_text in stats_diff.keys():
+                (cls, diff) = stats_diff[clear_text]
+                block.setCellCSSClass(idx, 0, cls)
+                if diff:
+                    value = block.getCellValue(idx, self.display_index)
+                    value += ' (+%d)' % diff
+                    block.setCellValue(idx, self.display_index, value)
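
This change matters because display rows can carry markup (for example, links), so the raw row text no longer matches the stat keys directly. A small self-contained sketch of the tag-stripping lookup added above, with made-up data:

    # Sketch of the cleaned-key lookup; stats_diff content is illustrative.
    import re

    tag_re = re.compile(r'<[^>]+>')

    def remove_tags(text):
        return tag_re.sub('', text)

    stats_diff = {'/index.html': ['iwla_update', 12]}
    row_title = '<a href="/index.html">/index.html</a>'

    clear_text = remove_tags(row_title)   # '/index.html'
    if clear_text in stats_diff:
        cls, diff = stats_diff[clear_text]
        print('%s -> %s (+%d)' % (clear_text, cls, diff))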
plugins/display/top_pages_diff.py
New file (copied from referers_diff.py):

# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2018
#
# This file is part of iwla
#
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#

from iwla import IWLA
from istats_diff import IWLADisplayStatsDiff
from display import *

"""
Display hook

Highlight new and updated pages in top_pages.html

Plugin requirements :
    display/top_pages

Conf values needed :
    None

Output files :
    None

Statistics creation :
    None

Statistics update :
    None

Statistics deletion :
    None
"""

class IWLADisplayTopPagesDiff(IWLADisplayStatsDiff):
    def __init__(self, iwla):
        super(IWLADisplayTopPagesDiff, self).__init__(iwla)
        self.API_VERSION = 1
        self.requires = ['IWLADisplayTopPages']
        self.month_stats_key = u'top_pages'
        self.filename = u'top_pages.html'
        self.block_name = self.iwla._(u'All Pages')

    def load(self):
        if not self.iwla.getConfValue('create_all_pages_page', True):
            return False
        return super(IWLADisplayTopPagesDiff, self).load()
plugins/post_analysis/ip_to_geo.py
@@ -82,6 +82,8 @@
                 (_, cc) = self.iptogeo.ip_to_geo(ip)
                 cc = cc and cc or 'ip'
                 visitor['country_code'] = cc
+                if not self.iwla.isValidVisitor(visitor):
+                    continue
                 if cc in geo.keys():
                     geo[cc] += 1
                 else:
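
The guard keeps invalid visitors out of the per-country counters. A tiny sketch of the intent, with an illustrative is_valid_visitor standing in for iwla's isValidVisitor() and made-up visitor dicts:

    # Only valid visitors contribute to the geo statistics.
    from collections import Counter

    def is_valid_visitor(v):
        return not v.get('robot') and v.get('viewed_pages', 0) > 0

    visitors = [
        {'country_code': 'fr', 'viewed_pages': 3},
        {'country_code': 'us', 'robot': True, 'viewed_pages': 5},
    ]

    geo = Counter(v['country_code'] for v in visitors if is_valid_visitor(v))
    print(geo)   # Counter({'fr': 1}) -- the robot is not counted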
plugins/post_analysis/referers.py
@@ -140,30 +140,32 @@
             uri = r['extract_referer']['extract_uri']
             if self.own_domain_re.match(uri): continue

-            is_search_engine = False
-            for (name, engine) in self.search_engines.items():
-                for (hashid, hashid_re) in engine['hashid']:
-                    if not hashid_re.match(uri): continue
-
-                    not_engine = engine.get('not_search_engine', None)
-                    # Try not engine
-                    if not_engine and not_engine.match(uri): break
-
-                    is_search_engine = True
-                    uri = name
-
-                    parameters = r['extract_referer'].get('extract_parameters', None)
-                    key_phrase_re = engine.get('known_url', None)
-
-                    self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
-                    break
-
-            if is_search_engine:
-                dictionary = search_engine_referers
-            elif super_hit['robot']:
+            if super_hit['robot']:
                 dictionary = robots_referers
                 # print '%s => %s' % (uri, super_hit['remote_ip'])
             else:
-                dictionary = referers
+                is_search_engine = False
+                for (name, engine) in self.search_engines.items():
+                    for (hashid, hashid_re) in engine['hashid']:
+                        if not hashid_re.match(uri): continue
+
+                        not_engine = engine.get('not_search_engine', None)
+                        # Try not engine
+                        if not_engine and not_engine.match(uri): break
+
+                        is_search_engine = True
+                        uri = name
+
+                        parameters = r['extract_referer'].get('extract_parameters', None)
+                        key_phrase_re = engine.get('known_url', None)
+
+                        self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
+                        break
+
+                if is_search_engine:
+                    dictionary = search_engine_referers
+                else:
+                    dictionary = referers

             if r['is_page']:
                 key = 'pages'
             else:
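
The reordering means the search-engine regex matching only runs for non-robot visitors. A self-contained sketch of the resulting control flow (the engine table and the classify_referer helper are illustrative, not iwla's real data structures):

    # Robots are classified first; regex matching is skipped for them.
    import re

    search_engines = {
        'Google': [re.compile(r'^(www\.)?google\.')],
    }

    def classify_referer(uri, is_robot):
        if is_robot:
            return 'robots'
        for name, regexes in search_engines.items():
            if any(r.match(uri) for r in regexes):
                return 'search_engine:' + name
        return 'referer'

    print(classify_referer('www.google.com', False))  # search_engine:Google
    print(classify_referer('www.google.com', True))   # robots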
plugins/pre_analysis/robots.py
@@ -109,6 +109,16 @@
             #     continue

+            # 2) pages without hit --> robot
+            if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
+                self._setRobot(k, super_hit)
+                continue
+
+            # 3) no pages and not hit --> robot
+            if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
+                self._setRobot(k, super_hit)
+                continue
+
             # 4) pages without hit --> robot
             if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
                 self.logger.debug(super_hit)
                 self._setRobot(k, super_hit)

@@ -116,15 +126,15 @@
             not_found_pages = 0
             for hit in super_hit['requests']:
-                # 3) /robots.txt read
+                # 5) /robots.txt read
                 if hit['extract_request']['http_uri'].endswith('/robots.txt'):
                     self._setRobot(k, super_hit)
                     break

-                if int(hit['status']) == 404:
+                if int(hit['status']) == 404 or int(hit['status']) == 403:
                     not_found_pages += 1

-                # 4) Any referer for hits
+                # 6) Any referer for hits
                 if not hit['is_page'] and hit['http_referer']:
                     referers += 1

@@ -132,7 +142,7 @@
                 self._setRobot(k, super_hit)
                 continue

-            # 5) more than 10 404 pages
+            # 7) more than 10 404/403 pages
             if not_found_pages > 10:
                 self._setRobot(k, super_hit)
                 continue
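
The updated rule counts both 404 and 403 responses toward the "too many errors" robot heuristic, with the threshold staying above 10. A minimal sketch of that heuristic with made-up hits (looks_like_robot is an illustrative name, not iwla's):

    # Count 404/403 responses and flag the visitor past the threshold.
    def looks_like_robot(hits, threshold=10):
        error_hits = sum(1 for h in hits if int(h['status']) in (404, 403))
        return error_hits > threshold

    hits = [{'status': '403'}] * 7 + [{'status': '404'}] * 5
    print(looks_like_robot(hits))   # True: 12 error responses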
