iwla

iwla Commit Details

Date: 2015-05-22 07:51:11 (6 years 2 months ago)
Author: Grégory Soutadé
Branch: dev, master
Commit: 4cb3b21ca5a648b717e6e6f8dc261b774af0ca8b
Parents: 86fc5f2189c07bf34c7d7e1a4885bf68c4609d07
Message: Add reset feature. Allow to open .gz file transparently. Import debug in robots.py.

Changes:
M iwla.py (4 diffs)
M plugins/pre_analysis/page_to_hit.py (4 diffs)
M plugins/pre_analysis/robots.py (5 diffs)

File differences

iwla.py
683683
684684
685685
686
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
687712
688713
689714
690715
691716
692717
718
719
693720
694721
695722
......
743770
744771
745772
746
773
774
775
776
777
747778
748779
749780
......
770801
771802
772803
804
805
806
773807
774808
775809
......
804838
805839
806840
807
841
808842
809843
810
844
return True
def start(self, _file):
def _reset(self):
reset_time = time.strptime(self.args.reset, '%m/%Y')
self.logger.info('Reset time')
self.logger.info(reset_time)
self.meta_infos['last_time'] = reset_time
cur_time = time.localtime()
year = reset_time.tm_year
while year < cur_time.tm_year:
db_path = os.path.join(conf.DB_ROOT, str(year))
if os.path.exists(db_path): shutil.rmtree(db_path)
output_path = os.path.join(conf.DISPLAY_ROOT, str(year))
if os.path.exists(output_path): shutil.rmtree(output_path)
year += 1
month = reset_time.tm_mon
while month <= cur_time.tm_mon:
db_path = os.path.join(conf.DB_ROOT, str(year), '%02d' % (month))
if os.path.exists(db_path): shutil.rmtree(db_path)
output_path = os.path.join(conf.DISPLAY_ROOT, str(year), '%02d' % (month))
if os.path.exists(output_path): shutil.rmtree(output_path)
month += 1
def start(self, _file, args):
self.args = args
self.start_time = datetime.now()
self.logger.info('==> Load previous database')
self.meta_infos = self._deserialize(conf.META_PATH) or self._clearMeta()
if self.meta_infos['last_time']:
if args.reset:
self._reset()
self.logger.info('Last time')
self.logger.info(self.meta_infos['last_time'])
self.current_analysis = self._deserialize(self.getDBFilename(self.meta_infos['last_time'])) or self._clearVisits()
self.cur_file = None
if not self.filenames:
raise StopIteration()
self.cur_file = open(self.filenames.pop(0))
filename = self.filenames.pop(0)
if filename.endswith('gz'):
self.cur_file = gzip.open(filename, 'r')
else:
self.cur_file = open(filename)
def next(self):
l = self.cur_file.readline()
default='INFO', type=str,
help='Loglevel in %s, default : %s' % (['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 'INFO'))
parser.add_argument('-r', '--reset', dest='reset',
help='Reset analysis to a specific date (month/year)')
args = parser.parse_args()
# Load user conf
sys.exit(0)
if args.stdin:
iwla.start(sys.stdin)
iwla.start(sys.stdin, args)
else:
filename = args.file or conf.analyzed_filename
iwla.start(FileIter(filename))
iwla.start(FileIter(filename), args)
plugins/pre_analysis/page_to_hit.py
1919
2020
2121
22
2223
2324
2425
......
6465
6566
6667
68
6769
6870
6971
......
8587
8688
8789
88
90
8991
9092
9193
......
9496
9597
9698
97
99
98100
99101
100102
#
import re
import logging
from iwla import IWLA
from iplugin import IPlugin
self.hp_regexps = self.iwla.getConfValue('hit_to_page_conf', [])
self.hp_regexps = map(lambda(r): re.compile(r), self.hp_regexps)
self.logger = logging.getLogger(self.__class__.__name__)
return True
def hook(self):
# Page to hit
for regexp in self.ph_regexps:
if regexp.match(uri):
#print '%s is a hit' % (uri )
self.logger.debug('%s changed from page to hit' % (uri))
request['is_page'] = False
super_hit['viewed_pages'] -= 1
super_hit['viewed_hits'] += 1
# Hit to page
for regexp in self.hp_regexps:
if regexp.match(uri):
#print '%s is a page' % (uri )
self.logger.debug('%s changed from hit to page' % (uri))
request['is_page'] = True
super_hit['viewed_pages'] += 1
super_hit['viewed_hits'] -= 1
plugins/pre_analysis/robots.py
2020
2121
2222
23
2324
2425
2526
......
6667
6768
6869
69
70
71
72
73
74
7075
7176
7277
......
8489
8590
8691
92
8793
8894
8995
......
9399
94100
95101
102
96103
97104
98105
......
103110
104111
105112
113
106114
107115
108116
import re
import logging
import inspect
from iwla import IWLA
from iplugin import IPlugin
return True
def _setRobot(self, k, super_hit):
self.logger.debug('%s is a robot' % (k))
callerframerecord = inspect.stack()[1]
frame = callerframerecord[0]
info = inspect.getframeinfo(frame)
self.logger.debug('%s is a robot (caller %s:%d)' % (k, info.function, info.lineno))
super_hit['robot'] = 1
# Basic rule to detect robots
if self.robot_re.match(first_page['http_user_agent']) or\
self.crawl_re.match(first_page['http_user_agent']):
self.logger.debug(first_page['http_user_agent'])
self._setRobot(k, super_hit)
continue
break
if isRobot:
self.logger.debug(first_page['http_user_agent'])
self._setRobot(k, super_hit)
continue
# 2) pages without hit --> robot
if not super_hit['viewed_hits']:
self.logger.debug(super_hit)
self._setRobot(k, super_hit)
continue

Archive Download the corresponding diff file

Branches

Tags