iwla

iwla Commit Details

Date: 2014-12-14 15:10:13 (6 years 7 months ago)
Author: Grégory Soutadé
Branch: dev, master
Commit: 3a246d5cd6aa07418313b5b47879acb07e199a52
Parents: 9da4eb385820683e6a54a9e6e5b8031157d4dd94
Message: Optimize analysis using reverse loop

Changes:
M iwla.py (2 diffs)
M plugins/post_analysis/referers.py (1 diff)
M plugins/post_analysis/top_downloads.py (1 diff)
M plugins/post_analysis/top_hits.py (1 diff)
M plugins/post_analysis/top_pages.py (1 diff)
M plugins/pre_analysis/page_to_hit.py (1 diff)

File differences

iwla.py
256256
257257
258258
259
259
260260
261261
262262
......
523523
524524
525525
526
526
527527
528528
529529
hit['is_page'] = self.isPage(uri)
if super_hit['robot'] or\
not int(hit['status']) in conf.viewed_http_codes:
not self.hasBeenViewed(hit):
page_key = 'not_viewed_pages'
hit_key = 'not_viewed_hits'
else:
if hit['time_decoded'].tm_mday != cur_time.tm_mday:
break
if super_hit['robot'] or\
not int(hit['status']) in conf.viewed_http_codes:
not self.hasBeenViewed(hit):
stats['not_viewed_bandwidth'] += int(hit['body_bytes_sent'])
continue
stats['viewed_bandwidth'] += int(hit['body_bytes_sent'])
plugins/post_analysis/referers.py
109109
110110
111111
112
113
112
113
114114
115115
116116
117
118
119117
120118
119
121120
122121
123122
key_phrases = month_stats.get('key_phrases', {})
for (k, super_hit) in stats.items():
for r in super_hit['requests']:
if not self.iwla.isValidForCurrentAnalysis(r): continue
for r in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(r): break
if not r['http_referer']: continue
uri = r['extract_referer']['extract_uri']
is_search_engine = False
if self.own_domain_re.match(uri): continue
is_search_engine = False
for (name, engine) in self.search_engines.items():
for (hashid, hashid_re) in engine['hashid']:
if not hashid_re.match(uri): continue
plugins/post_analysis/top_downloads.py
4646
4747
4848
49
50
51
49
50
51
52
53
5254
53
54
55
56
5755
5856
5957
for (k, super_hit) in stats.items():
if super_hit['robot']: continue
for r in super_hit['requests']:
if not self.iwla.isValidForCurrentAnalysis(r) or\
not self.iwla.hasBeenViewed(r):
for r in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(r):
break
if not self.iwla.hasBeenViewed(r) or\
r['is_page']:
continue
if r['is_page']: continue
if not int(r['status']) in viewed_http_codes: continue
uri = r['extract_request']['extract_uri'].lower()
plugins/post_analysis/top_hits.py
4040
4141
4242
43
44
45
46
47
43
44
45
46
47
4848
4949
50
51
50
5251
5352
5453
for (k, super_hit) in stats.items():
if super_hit['robot']: continue
for r in super_hit['requests']:
if r['is_page']: continue
if not self.iwla.isValidForCurrentAnalysis(r) or\
not self.iwla.hasBeenViewed(r):
for r in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(r):
break
if not self.iwla.hasBeenViewed(r) or\
r['is_page']:
continue
uri = r['extract_request']['extract_uri']
uri = r['extract_request']['extract_uri'].lower()
uri = "%s%s" % (r.get('server_name', ''), uri)
if not uri in top_hits.keys():
plugins/post_analysis/top_pages.py
4646
4747
4848
49
50
51
52
53
49
50
51
52
53
5454
5555
5656
for (k, super_hit) in stats.items():
if super_hit['robot']: continue
for r in super_hit['requests']:
if not r['is_page']: continue
if not self.iwla.isValidForCurrentAnalysis(r) or\
not self.iwla.hasBeenViewed(r):
for r in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(r):
break
if not self.iwla.hasBeenViewed(r) or\
not r['is_page']:
continue
uri = r['extract_request']['extract_uri']
plugins/pre_analysis/page_to_hit.py
5454
5555
5656
57
58
59
57
58
59
60
61
6062
6163
6264
for (k, super_hit) in hits.items():
if super_hit['robot']: continue
for request in super_hit['requests']:
if not self.iwla.isValidForCurrentAnalysis(request) or\
not self.iwla.hasBeenViewed(request):
for request in super_hit['requests'][::-1]:
if not self.iwla.isValidForCurrentAnalysis(request):
break
if not self.iwla.hasBeenViewed(request):
continue
uri = request['extract_request']['extract_uri']

Archive Download the corresponding diff file

Branches

Tags