Merge branch 'master' of soutade.fr:iwla

Conflicts:
	conf.py
This commit is contained in:
Gregory Soutade 2014-12-14 15:15:05 +01:00
commit bd31b04e9b
7 changed files with 77 additions and 83 deletions

14
TODO Normal file
View File

@ -0,0 +1,14 @@
reverse analysis
-f option to read a file instead of the one in conf
Show "Other" when pages are truncated
translations
doc auto generation
doc enhancement
Limit hits/pages/downloads by rate
Automatic tests
Test separate directory for DB and display
quiet mode
Add 0 before month when < 10
Add Licence
Free memory as soon as possible
Bug in bandwidth accounting (x10)

90
iwla.py
View File

@ -174,7 +174,7 @@ class IWLA(object):
def getCurDisplayPath(self, filename): def getCurDisplayPath(self, filename):
cur_time = self.meta_infos['last_time'] cur_time = self.meta_infos['last_time']
return os.path.join(str(cur_time.tm_year), str(cur_time.tm_mon), filename) return os.path.join(str(cur_time.tm_year), '%02d' % (cur_time.tm_mon), filename)
def getResourcesPath(self): def getResourcesPath(self):
return conf.resources_path return conf.resources_path
@ -194,7 +194,7 @@ class IWLA(object):
return self.display return self.display
def getDBFilename(self, time): def getDBFilename(self, time):
return os.path.join(conf.DB_ROOT, str(time.tm_year), str(time.tm_mon), conf.DB_FILENAME) return os.path.join(conf.DB_ROOT, str(time.tm_year), '%02d' % (time.tm_mon), conf.DB_FILENAME)
def _serialize(self, obj, filename): def _serialize(self, obj, filename):
base = os.path.dirname(filename) base = os.path.dirname(filename)
@ -255,12 +255,8 @@ class IWLA(object):
hit['is_page'] = self.isPage(uri) hit['is_page'] = self.isPage(uri)
status = int(hit['status'])
if status not in conf.viewed_http_codes:
return
if super_hit['robot'] or\ if super_hit['robot'] or\
not status in conf.viewed_http_codes: not self.hasBeenViewed(hit):
page_key = 'not_viewed_pages' page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits' hit_key = 'not_viewed_hits'
else: else:
@ -336,7 +332,7 @@ class IWLA(object):
def _generateDisplayDaysStats(self): def _generateDisplayDaysStats(self):
cur_time = self.meta_infos['last_time'] cur_time = self.meta_infos['last_time']
title = 'Stats %d/%d' % (cur_time.tm_mon, cur_time.tm_year) title = 'Stats %d/%02d' % (cur_time.tm_year, cur_time.tm_mon)
filename = self.getCurDisplayPath('index.html') filename = self.getCurDisplayPath('index.html')
print '==> Generate display (%s)' % (filename) print '==> Generate display (%s)' % (filename)
page = self.display.createPage(title, filename, conf.css_path) page = self.display.createPage(title, filename, conf.css_path)
@ -405,7 +401,7 @@ class IWLA(object):
full_month = '%s %d' % (months_name[i], year) full_month = '%s %d' % (months_name[i], year)
if i in month_stats.keys(): if i in month_stats.keys():
stats = month_stats[i] stats = month_stats[i]
link = '<a href="%d/%d/index.html">Details</a>' % (year, i) link = '<a href="%d/%02d/index.html">Details</a>' % (year, i)
row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'], row = [full_month, stats['nb_visitors'], stats['viewed_pages'], stats['viewed_hits'],
stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link] stats['viewed_bandwidth'], stats['not_viewed_bandwidth'], link]
for j in graph_cols: for j in graph_cols:
@ -449,34 +445,14 @@ class IWLA(object):
self._generateDisplayWholeMonthStats() self._generateDisplayWholeMonthStats()
self.display.build(conf.DISPLAY_ROOT) self.display.build(conf.DISPLAY_ROOT)
def _generateStats(self, visits): def _createEmptyStats(self):
stats = {} stats = {}
stats['viewed_bandwidth'] = 0 stats['viewed_bandwidth'] = 0
stats['not_viewed_bandwidth'] = 0 stats['not_viewed_bandwidth'] = 0
stats['viewed_pages'] = 0 stats['viewed_pages'] = 0
stats['viewed_hits'] = 0 stats['viewed_hits'] = 0
#stats['requests'] = set()
stats['nb_visitors'] = 0 stats['nb_visitors'] = 0
for (k, super_hit) in visits.items():
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwidth']
continue
#print "[%s] =>\t%d/%d" % (k, super_hit['viewed_pages'], super_hit['viewed_hits'])
if conf.count_hit_only_visitors or\
super_hit['viewed_pages']:
stats['nb_visitors'] += 1
stats['viewed_bandwidth'] += super_hit['bandwidth']
stats['viewed_pages'] += super_hit['viewed_pages']
stats['viewed_hits'] += super_hit['viewed_hits']
# for p in super_hit['requests']:
# if not p['is_page']: continue
# req = p['extract_request']
# stats['requests'].add(req['extract_uri'])
return stats return stats
def _generateMonthStats(self): def _generateMonthStats(self):
@ -484,11 +460,15 @@ class IWLA(object):
visits = self.current_analysis['visits'] visits = self.current_analysis['visits']
stats = self._generateStats(visits) stats = self._createEmptyStats()
for (day, stat) in self.current_analysis['days_stats'].items():
for k in stats.keys():
stats[k] += stat[k]
duplicated_stats = {k:v for (k,v) in stats.items()} duplicated_stats = {k:v for (k,v) in stats.items()}
cur_time = self.meta_infos['last_time'] cur_time = self.meta_infos['last_time']
print "== Stats for %d/%d ==" % (cur_time.tm_year, cur_time.tm_mon) print "== Stats for %d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon)
print stats print stats
if not 'month_stats' in self.current_analysis.keys(): if not 'month_stats' in self.current_analysis.keys():
@ -514,7 +494,6 @@ class IWLA(object):
os.remove(path) os.remove(path)
print "==> Serialize to %s" % path print "==> Serialize to %s" % path
self._serialize(self.current_analysis, path) self._serialize(self.current_analysis, path)
# Save month stats # Save month stats
@ -530,31 +509,35 @@ class IWLA(object):
def _generateDayStats(self): def _generateDayStats(self):
visits = self.current_analysis['visits'] visits = self.current_analysis['visits']
cur_time = self.meta_infos['last_time']
self._callPlugins(conf.PRE_HOOK_DIRECTORY) self._callPlugins(conf.PRE_HOOK_DIRECTORY)
stats = self._generateStats(visits) stats = self._createEmptyStats()
cur_time = self.meta_infos['last_time'] for (k, super_hit) in visits.items():
print "== Stats for %d/%d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday) if super_hit['last_access'].tm_mday != cur_time.tm_mday:
if cur_time.tm_mday > 1:
last_day = cur_time.tm_mday - 1
while last_day:
if last_day in self.current_analysis['days_stats'].keys():
break
last_day -= 1
if last_day:
for k in stats.keys():
stats[k] -= self.current_analysis['days_stats'][last_day][k]
stats['nb_visitors'] = 0
for (k,v) in visits.items():
if v['robot']: continue
if conf.count_hit_only_visitors and\
(not v['viewed_pages']):
continue continue
if v['last_access'].tm_mday == cur_time.tm_mday: viewed_page = False
for hit in super_hit['requests'][::-1]:
if hit['time_decoded'].tm_mday != cur_time.tm_mday:
break
if super_hit['robot'] or\
not self.hasBeenViewed(hit):
stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
continue
stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
if hit['is_page']:
stats['viewed_pages'] += 1
viewed_pages = True
else:
stats['viewed_hits'] += 1
if (conf.count_hit_only_visitors or\
viewed_pages):
stats['nb_visitors'] += 1 stats['nb_visitors'] += 1
print "== Stats for %d/%02d/%d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday)
print stats print stats
self.current_analysis['days_stats'][cur_time.tm_mday] = stats self.current_analysis['days_stats'][cur_time.tm_mday] = stats
@ -568,10 +551,9 @@ class IWLA(object):
self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits() self.current_analysis = self._deserialize(self.getDBFilename(t)) or self._clearVisits()
self.analyse_started = True self.analyse_started = True
else: else:
if not self.analyse_started:
if time.mktime(t) < time.mktime(cur_time): if time.mktime(t) < time.mktime(cur_time):
return False return False
else: if not self.analyse_started:
self.analyse_started = True self.analyse_started = True
if cur_time.tm_mon != t.tm_mon: if cur_time.tm_mon != t.tm_mon:
self._generateMonthStats() self._generateMonthStats()

View File

@ -109,15 +109,14 @@ class IWLAPostAnalysisReferers(IPlugin):
key_phrases = month_stats.get('key_phrases', {}) key_phrases = month_stats.get('key_phrases', {})
for (k, super_hit) in stats.items(): for (k, super_hit) in stats.items():
for r in super_hit['requests']: for r in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(r): continue if not self.iwla.isValidForCurrentAnalysis(r): break
if not r['http_referer']: continue if not r['http_referer']: continue
uri = r['extract_referer']['extract_uri'] uri = r['extract_referer']['extract_uri']
is_search_engine = False
if self.own_domain_re.match(uri): continue if self.own_domain_re.match(uri): continue
is_search_engine = False
for (name, engine) in self.search_engines.items(): for (name, engine) in self.search_engines.items():
for (hashid, hashid_re) in engine['hashid']: for (hashid, hashid_re) in engine['hashid']:
if not hashid_re.match(uri): continue if not hashid_re.match(uri): continue

View File

@ -46,14 +46,12 @@ class IWLAPostAnalysisTopDownloads(IPlugin):
for (k, super_hit) in stats.items(): for (k, super_hit) in stats.items():
if super_hit['robot']: continue if super_hit['robot']: continue
for r in super_hit['requests']: for r in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(r) or\ if not self.iwla.isValidForCurrentAnalysis(r):
not self.iwla.hasBeenViewed(r): break
if not self.iwla.hasBeenViewed(r) or\
r['is_page']:
continue continue
if r['is_page']: continue
if not int(r['status']) in viewed_http_codes: continue
uri = r['extract_request']['extract_uri'].lower() uri = r['extract_request']['extract_uri'].lower()

View File

@ -40,15 +40,14 @@ class IWLAPostAnalysisTopHits(IPlugin):
for (k, super_hit) in stats.items(): for (k, super_hit) in stats.items():
if super_hit['robot']: continue if super_hit['robot']: continue
for r in super_hit['requests']: for r in super_hit['requests'][::-1]:
if r['is_page']: continue if not self.iwla.isValidForCurrentAnalysis(r):
break
if not self.iwla.isValidForCurrentAnalysis(r) or\ if not self.iwla.hasBeenViewed(r) or\
not self.iwla.hasBeenViewed(r): r['is_page']:
continue continue
uri = r['extract_request']['extract_uri'] uri = r['extract_request']['extract_uri'].lower()
uri = "%s%s" % (r.get('server_name', ''), uri) uri = "%s%s" % (r.get('server_name', ''), uri)
if not uri in top_hits.keys(): if not uri in top_hits.keys():

View File

@ -46,11 +46,11 @@ class IWLAPostAnalysisTopPages(IPlugin):
for (k, super_hit) in stats.items(): for (k, super_hit) in stats.items():
if super_hit['robot']: continue if super_hit['robot']: continue
for r in super_hit['requests']: for r in super_hit['requests'][::-1]:
if not r['is_page']: continue if not self.iwla.isValidForCurrentAnalysis(r):
break
if not self.iwla.isValidForCurrentAnalysis(r) or\ if not self.iwla.hasBeenViewed(r) or\
not self.iwla.hasBeenViewed(r): not r['is_page']:
continue continue
uri = r['extract_request']['extract_uri'] uri = r['extract_request']['extract_uri']

View File

@ -54,9 +54,11 @@ class IWLAPreAnalysisPageToHit(IPlugin):
for (k, super_hit) in hits.items(): for (k, super_hit) in hits.items():
if super_hit['robot']: continue if super_hit['robot']: continue
for request in super_hit['requests']: for request in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(request) or\ if not self.iwla.isValidForCurrentAnalysis(request):
not self.iwla.hasBeenViewed(request): break
if not self.iwla.hasBeenViewed(request):
continue continue
uri = request['extract_request']['extract_uri'] uri = request['extract_request']['extract_uri']