iwla

iwla Commit Details

Date:2017-08-24 07:55:53 (3 years 11 months ago)
Author:Grégory Soutadé
Branch:dev, master
Commit:007be71ad65c902fbae297d2e947c0b8eae840cf
Parents: fffab335fa4ac053365c491162748e0daaa12e70
Message: New format for (not_)viewed pages/hits and bandwidth that are now recorded by day (in a dictionary where only element 0 is initialized). Element 0 is the total. WARNING: not backward compatible with previous databases.

Changes:
Miwla.py (5 diffs)
Mplugins/display/all_visits.py (1 diff)
Mplugins/display/feeds.py (1 diff)
Mplugins/display/robot_bandwidth.py (1 diff)
Mplugins/display/top_visitors.py (2 diffs)
Mplugins/post_analysis/feeds.py (2 diffs)
Mplugins/pre_analysis/page_to_hit.py (2 diffs)
Mplugins/pre_analysis/robots.py (2 diffs)

File differences

iwla.py
9595
9696
9797
98
99
100
101
102
98
99
100
101
102
103103
104104
105105
......
298298
299299
300300
301
302
301
303302
304303
305304
......
318317
319318
320319
321
320
321
322
323
324
322325
323326
324327
......
336339
337340
338341
339
342
343
340344
341
345
346
342347
343348
344349
345350
346351
347
348
349
350
351
352
353
354
355
356
352357
353358
354359
......
659664
660665
661666
667
662668
663
669
664670
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
671
672
673
674
675
676
677
678
682679
683680
684681
remote_addr =>
remote_addr
remote_ip
viewed_pages
viewed_hits
not_viewed_pages
not_viewed_hits
bandwidth
viewed_pages{0..31} # 0 contains total
viewed_hits{0..31} # 0 contains total
not_viewed_pages{0..31}
not_viewed_hits{0..31}
bandwidth{0..31}
last_access
requests =>
[fields_from_format_log]
def isValidVisitor(self, hit):
if hit['robot']: return False
if not (conf.count_hit_only_visitors or\
hit['viewed_pages']):
if not conf.count_hit_only_visitors and not hit['viewed_pages'][0]:
return False
return True
# Don't keep all requests for robots
if not super_hit['robot']:
super_hit['requests'].append(hit)
super_hit['bandwidth'] += int(hit['body_bytes_sent'])
day = self.meta_infos['last_time'].tm_mday
if self.hasBeenViewed(hit):
super_hit['bandwidth'][day] = super_hit['bandwidth'].get(day, 0) + int(hit['body_bytes_sent'])
super_hit['bandwidth'][0] += int(hit['body_bytes_sent'])
super_hit['last_access'] = self.meta_infos['last_time']
request = hit['extract_request']
hit_key = 'viewed_hits'
if hit['is_page']:
super_hit[page_key] += 1
super_hit[page_key][day] = super_hit[page_key].get(day, 0) + 1
super_hit[page_key][0] += 1
else:
super_hit[hit_key] += 1
super_hit[hit_key][day] = super_hit[hit_key].get(day, 0) + 1
super_hit[hit_key][0] += 1
def _createVisitor(self, hit):
super_hit = self.current_analysis['visits'][hit['remote_addr']] = {}
super_hit['remote_addr'] = hit['remote_addr']
super_hit['remote_ip'] = hit['remote_addr']
super_hit['viewed_pages'] = 0
super_hit['viewed_hits'] = 0
super_hit['not_viewed_pages'] = 0
super_hit['not_viewed_hits'] = 0
super_hit['bandwidth'] = 0
super_hit['viewed_pages'] = {0:0}
super_hit['viewed_hits'] = {0:0}
super_hit['not_viewed_pages'] = {0:0}
super_hit['not_viewed_hits'] = {0:0}
super_hit['bandwidth'] = {0:0}
super_hit['last_access'] = self.meta_infos['last_time']
super_hit['requests'] = []
super_hit['robot'] = False
stats = self._createEmptyStats()
day = cur_time.tm_mday
for (k, super_hit) in visits.items():
if super_hit['last_access'].tm_mday != cur_time.tm_mday:
if super_hit['last_access'].tm_mday != day:
continue
viewed_pages = False
for hit in super_hit['requests'][::-1]:
if hit['time_decoded'].tm_mday != cur_time.tm_mday:
break
if super_hit['robot'] or\
not self.hasBeenViewed(hit):
stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
continue
stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
if hit['is_page']:
stats['viewed_pages'] += 1
viewed_pages = True
else:
stats['viewed_hits'] += 1
if (conf.count_hit_only_visitors or\
viewed_pages) and\
not super_hit['robot']:
if super_hit['robot']:
stats['not_viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
continue
stats['viewed_bandwidth'] += super_hit['bandwidth'].get(day, 0)
stats['viewed_hits'] += super_hit['viewed_hits'].get(day, 0)
stats['viewed_pages'] += super_hit['viewed_pages'].get(day, 0)
if ((conf.count_hit_only_visitors and super_hit['viewed_hits'].get(day, 0)) or\
super_hit['viewed_pages'].get(day, 0)):
stats['nb_visits'] += 1
self.logger.info("== Stats for %d/%02d/%02d ==" % (cur_time.tm_year, cur_time.tm_mon, cur_time.tm_mday))
plugins/display/all_visits.py
7878
7979
8080
81
82
83
81
82
83
8484
8585
8686
row = [
address,
super_hit['viewed_pages'],
super_hit['viewed_hits'],
bytesToStr(super_hit['bandwidth']),
super_hit['viewed_pages'][0],
super_hit['viewed_hits'][0],
bytesToStr(super_hit['bandwidth'][0]),
time.asctime(super_hit['last_access'])
]
table.appendRow(row)
plugins/display/feeds.py
8787
8888
8989
90
90
9191
92
92
9393
9494
9595
if super_hit['feed_parser'] == IWLAPostAnalysisFeeds.MERGED_FEED_PARSER:
address += '*'
if super_hit['robot']:
table.appendRow([address, super_hit['not_viewed_pages'], super_hit['not_viewed_hits']])
table.appendRow([address, super_hit['not_viewed_pages'][0], super_hit['not_viewed_hits'][0]])
else:
table.appendRow([address, super_hit['viewed_pages'], super_hit['viewed_hits']])
table.appendRow([address, super_hit['viewed_pages'][0], super_hit['viewed_hits'][0]])
page.appendBlock(table)
note = DisplayHTMLRaw(self.iwla, ('<small>*%s</small>' % (self.iwla._(u'Merged feeds parsers'))))
page.appendBlock(note)
plugins/display/robot_bandwidth.py
6868
6969
7070
71
71
7272
7373
7474
for (k, super_hit) in hits.items():
if not self.iwla.isRobot(super_hit):
continue
bandwidths.append((super_hit, super_hit['bandwidth']))
bandwidths.append((super_hit, super_hit['bandwidth'][0]))
bandwidths.sort(key=lambda tup: tup[1], reverse=True)
# All in a page
plugins/display/top_visitors.py
6060
6161
6262
63
64
65
63
64
65
6666
67
67
6868
6969
7070
......
7979
8080
8181
82
83
84
82
83
84
8585
8686
87
88
89
87
88
89
9090
9191
9292
total = [0]*5
for super_hit in hits.values():
total[1] += super_hit['viewed_pages']
total[2] += super_hit['viewed_hits']
total[3] += super_hit['bandwidth']
total[1] += super_hit['viewed_pages'][0]
total[2] += super_hit['viewed_hits'][0]
total[3] += super_hit['bandwidth'][0]
top_bandwidth = [(k,v['bandwidth']) for (k,v) in hits.items()]
top_bandwidth = [(k,v['bandwidth'][0]) for (k,v) in hits.items()]
top_bandwidth = sorted(top_bandwidth, key=lambda t: t[1], reverse=True)
top_visitors = [hits[h[0]] for h in top_bandwidth[:10]]
row = [
address,
super_hit['viewed_pages'],
super_hit['viewed_hits'],
bytesToStr(super_hit['bandwidth']),
super_hit['viewed_pages'][0],
super_hit['viewed_hits'][0],
bytesToStr(super_hit['bandwidth'][0]),
time.asctime(super_hit['last_access'])
]
total[1] -= super_hit['viewed_pages']
total[2] -= super_hit['viewed_hits']
total[3] -= super_hit['bandwidth']
total[1] -= super_hit['viewed_pages'][0]
total[2] -= super_hit['viewed_hits'][0]
total[3] -= super_hit['bandwidth'][0]
table.appendRow(row)
if total[1] or total[2] or total[3]:
total[0] = self.iwla._(u'Others')
plugins/post_analysis/feeds.py
7878
7979
8080
81
81
8282
8383
8484
......
117117
118118
119119
120
120
121121
122122
123123
return True
def mergeOneHitOnlyFeedsParsers(self, isFeedParser, one_hit_only, hit):
if isFeedParser and (hit['viewed_hits'] + hit['not_viewed_hits']) == 1:
if isFeedParser and (hit['viewed_hits'][0] + hit['not_viewed_hits'][0]) == 1:
user_agent = hit['requests'][0]['http_user_agent'].lower()
if one_hit_only.get(user_agent, None) is None:
# Merged
isFeedParser = self.FEED_PARSER
# Robot that views pages -> bot
if hit['robot']:
if hit['not_viewed_pages']:
if hit['not_viewed_pages'][0]:
isFeedParser = self.NOT_A_FEED_PARSER
break
if self.merge_one_hit_only_feeds_parsers:
plugins/pre_analysis/page_to_hit.py
8383
8484
8585
86
8687
8788
8889
8990
9091
9192
92
93
93
94
95
96
9497
9598
9699
......
98101
99102
100103
101
102
104
105
106
107
103108
uri = request['extract_request']['extract_uri']
day = request['time_decoded'].tm_mday
if request['is_page']:
# Page to hit
for regexp in self.ph_regexps:
if regexp.match(uri):
self.logger.debug('%s changed from page to hit' % (uri))
request['is_page'] = False
super_hit['viewed_pages'] -= 1
super_hit['viewed_hits'] += 1
super_hit['viewed_pages'][day] -= 1
super_hit['viewed_hits'][day] = super_hit['viewed_hits'].get(day, 0) + 1
super_hit['viewed_pages'][0] -= 1
super_hit['viewed_hits'][0] += 1
break
else:
# Hit to page
if regexp.match(uri):
self.logger.debug('%s changed from hit to page' % (uri))
request['is_page'] = True
super_hit['viewed_pages'] += 1
super_hit['viewed_hits'] -= 1
super_hit['viewed_pages'][day] = super_hit['viewed_pages'].get(day, 0) + 1
super_hit['viewed_hits'][day] -= 1
super_hit['viewed_pages'][0] += 1
super_hit['viewed_hits'][0] -= 1
break
plugins/pre_analysis/robots.py
104104
105105
106106
107
107
108108
109109
110110
111111
112
112
113113
114114
115115
......
137137
138138
139139
140
141
140
141
142142
143143
continue
# 1) no pages view --> robot
# if not super_hit['viewed_pages']:
# if not super_hit['viewed_pages'][0]:
# super_hit['robot'] = 1
# continue
# 2) pages without hit --> robot
if not super_hit['viewed_hits']:
if not super_hit['viewed_hits'][0] and super_hit['viewed_pages'][0]:
self.logger.debug(super_hit)
self._setRobot(k, super_hit)
continue
self._setRobot(k, super_hit)
continue
if not super_hit['viewed_pages'] and \
(super_hit['viewed_hits'] and not referers):
if not super_hit['viewed_pages'][0] and \
(super_hit['viewed_hits'][0] and not referers):
self._setRobot(k, super_hit)
continue

Archive Download the corresponding diff file

Branches

Tags