Make backup before compressing (low memory servers)

Fix error: Call post hook plugins even in display-only mode
Don't compute unordered hits (drop past hits if they are found after the current one)
Remove tags in stats diff
Don't do geolocalisation if the visitor is not valid
Don't try to find search engine on robots
Update robot check rules
Add top_pages_diff plugin
This commit is contained in:
Gregory Soutade 2019-08-30 07:50:54 +02:00
parent ed6ed68706
commit bb268114b2
6 changed files with 131 additions and 36 deletions

16
iwla.py
View File

@ -252,12 +252,18 @@ class IWLA(object):
if not os.path.exists(base):
os.makedirs(base)
# Make a backup in case of something fails
if os.path.exists(filename):
shutil.copy(filename, filename + '.bak')
with open(filename + '.tmp', 'wb+') as f, self._openDB(filename, 'w') as fzip:
cPickle.dump(obj, f)
os.fsync(f)
f.seek(0)
fzip.write(f.read())
os.fsync(fzip)
os.remove(filename + '.tmp')
if os.path.exists(filename + '.bak'):
os.remove(filename + '.bak')
def _deserialize(self, filename):
if not os.path.exists(filename):
@ -626,15 +632,13 @@ class IWLA(object):
duplicated_stats['nb_visitors'] = stats['nb_visitors'] = len(self.valid_visitors.keys())
self._callPlugins(conf.POST_HOOK_DIRECTORY)
if args.display_only:
self._generateDisplay()
return
self._callPlugins(conf.POST_HOOK_DIRECTORY)
path = self.getDBFilename(cur_time)
if os.path.exists(path) and not self.dry_run:
os.remove(path)
self.logger.info("==> Serialize to %s" % (path))
self._serialize(self.current_analysis, path)
@ -701,6 +705,8 @@ class IWLA(object):
self.logger.debug("Not in time")
return False
self.analyse_started = True
if t < cur_time: # Don't accept past hits
return False
if cur_time.tm_mon != t.tm_mon:
self._generateDayStats()
self._generateMonthStats()

View File

@ -22,6 +22,7 @@ from iwla import IWLA
from iplugin import IPlugin
from display import *
import logging
import re
"""
Display hook interface
@ -54,9 +55,11 @@ class IWLADisplayStatsDiff(IPlugin):
self.month_stats_key = None
# Set >= if month_stats[self.month_stats_key] is a list or a tuple
self.stats_index = -1
self.display_index = 1
self.filename = None
self.block_name = None
self.logger = logging.getLogger(__name__)
self.tag_re = re.compile(r'<[^>]+>')
def load(self):
if not self.month_stats_key or not self.filename or\
@ -67,6 +70,10 @@ class IWLADisplayStatsDiff(IPlugin):
self.cur_stats = {k:v for (k,v) in month_stats.get(self.month_stats_key, {}).items()}
return True
# from https://tutorialedge.net/python/removing-html-from-string/
def remove_tags(self, text):
    """Return *text* with every HTML/XML tag stripped out.

    Relies on the precompiled ``self.tag_re`` pattern (``<[^>]+>``) so that
    repeated calls while annotating display rows stay cheap.
    """
    cleaned = self.tag_re.sub('', text)
    return cleaned
def hook(self):
display = self.iwla.getDisplay()
month_stats = self.iwla.getMonthStats()
@ -88,14 +95,21 @@ class IWLADisplayStatsDiff(IPlugin):
if new_value:
if self.stats_index != -1:
if new_value[self.stats_index] != v[self.stats_index]:
stats_diff[k] = 'iwla_update'
diff_value = v[self.stats_index] - new_value[self.stats_index]
stats_diff[k] = ['iwla_update', diff_value]
else:
if new_value != v:
stats_diff[k] = 'iwla_update'
diff_value = v - new_value
stats_diff[k] = ['iwla_update', diff_value]
else:
stats_diff[k] = 'iwla_new'
stats_diff[k] = ['iwla_new', 0]
for (idx, row) in enumerate(block.rows):
for k in stats_diff.keys():
if k in row[0]:
block.setCellCSSClass(idx, 0, stats_diff[k])
clear_text = self.remove_tags(row[0])
if clear_text in stats_diff.keys():
(cls, diff) = stats_diff[clear_text]
block.setCellCSSClass(idx, 0, cls)
if diff:
value = block.getCellValue(idx, self.display_index)
value += ' (+%d)' % diff
block.setCellValue(idx, self.display_index, value)

View File

@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé 2018
# This file is part of iwla
# iwla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# iwla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with iwla. If not, see <http://www.gnu.org/licenses/>.
#
from iwla import IWLA
from istats_diff import IWLADisplayStatsDiff
from display import *
"""
Display hook
Highlight new and updated pages in top_pages.html
Plugin requirements :
display/top_pages
Conf values needed :
None
Output files :
None
Statistics creation :
None
Statistics update :
None
Statistics deletion :
None
"""
class IWLADisplayTopPagesDiff(IWLADisplayStatsDiff):
    """Display hook that highlights new and updated entries of the top pages.

    Specializes IWLADisplayStatsDiff for the ``top_pages`` statistics key
    and the "All Pages" block of ``top_pages.html``.
    """

    def __init__(self, iwla):
        super(IWLADisplayTopPagesDiff, self).__init__(iwla)
        self.API_VERSION = 1
        # The page annotated by this plugin must exist.
        self.requires = ['IWLADisplayTopPages']
        # Parameters consumed by the IWLADisplayStatsDiff base class.
        self.month_stats_key = u'top_pages'
        self.filename = u'top_pages.html'
        self.block_name = self.iwla._(u'All Pages')

    def load(self):
        """Load only when the all-pages page is actually generated."""
        if self.iwla.getConfValue('create_all_pages_page', True):
            return super(IWLADisplayTopPagesDiff, self).load()
        return False

View File

@ -82,6 +82,8 @@ class IWLAPostAnalysisIPToGeo(IPlugin):
(_, cc) = self.iptogeo.ip_to_geo(ip)
cc = cc and cc or 'ip'
visitor['country_code'] = cc
if not self.iwla.isValidVisitor(visitor):
continue
if cc in geo.keys():
geo[cc] += 1
else:

View File

@ -140,30 +140,32 @@ class IWLAPostAnalysisReferers(IPlugin):
uri = r['extract_referer']['extract_uri']
if self.own_domain_re.match(uri): continue
is_search_engine = False
for (name, engine) in self.search_engines.items():
for (hashid, hashid_re) in engine['hashid']:
if not hashid_re.match(uri): continue
not_engine = engine.get('not_search_engine', None)
# Try not engine
if not_engine and not_engine.match(uri): break
is_search_engine = True
uri = name
parameters = r['extract_referer'].get('extract_parameters', None)
key_phrase_re = engine.get('known_url', None)
self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
break
if is_search_engine:
dictionary = search_engine_referers
elif super_hit['robot']:
if super_hit['robot']:
dictionary = robots_referers
# print '%s => %s' % (uri, super_hit['remote_ip'])
else:
dictionary = referers
is_search_engine = False
for (name, engine) in self.search_engines.items():
for (hashid, hashid_re) in engine['hashid']:
if not hashid_re.match(uri): continue
not_engine = engine.get('not_search_engine', None)
# Try not engine
if not_engine and not_engine.match(uri): break
is_search_engine = True
uri = name
parameters = r['extract_referer'].get('extract_parameters', None)
key_phrase_re = engine.get('known_url', None)
self._extractKeyPhrase(key_phrase_re, parameters, key_phrases)
break
if is_search_engine:
dictionary = search_engine_referers
else:
dictionary = referers
if r['is_page']:
key = 'pages'
else:

View File

@ -109,6 +109,16 @@ class IWLAPreAnalysisRobots(IPlugin):
# continue
# 2) pages without hit --> robot
if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
self._setRobot(k, super_hit)
continue
# 3) no pages and not hit --> robot
if not super_hit['viewed_hits'][0] and not super_hit['viewed_pages'][0]:
self._setRobot(k, super_hit)
continue
# 4) pages without hit --> robot
if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
self.logger.debug(super_hit)
self._setRobot(k, super_hit)
@ -116,15 +126,15 @@ class IWLAPreAnalysisRobots(IPlugin):
not_found_pages = 0
for hit in super_hit['requests']:
# 3) /robots.txt read
# 5) /robots.txt read
if hit['extract_request']['http_uri'].endswith('/robots.txt'):
self._setRobot(k, super_hit)
break
if int(hit['status']) == 404:
if int(hit['status']) == 404 or int(hit['status']) == 403:
not_found_pages += 1
# 4) Any referer for hits
# 6) Any referer for hits
if not hit['is_page'] and hit['http_referer']:
referers += 1
@ -132,7 +142,7 @@ class IWLAPreAnalysisRobots(IPlugin):
self._setRobot(k, super_hit)
continue
# 5) more than 10 404 pages
# 7) more than 10 404/403 pages
if not_found_pages > 10:
self._setRobot(k, super_hit)
continue