# -*- coding: utf-8 -*-␊ |
"""␊ |
Copyright 2016 Grégory Soutadé␊ |
␊ |
This file is part of Dénote.␊ |
␊ |
Dénote is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Dénote is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Dénote. If not, see <http://www.gnu.org/licenses/>.
"""␊ |
import re␊ |
import unicodedata␊ |
import os␊ |
import operator␊ |
import pickle␊ |
from django.db import models␊ |
␊ |
import models␊ |
#from models import Note␊ |
␊ |
class Search:
    """Word index over notes, persisted as a pickle file.

    The index ("hashtable") is a dict mapping a normalized word (markup and
    punctuation removed, accents stripped, lower-cased) to a list of
    ``[note_id, weight]`` pairs, most recently indexed note first.  It is
    stored in ``$DENOTE_ROOT/_search.db``.

    All text is processed as unicode; byte strings handed to the class are
    decoded as UTF-8 first.
    """

    # Words shorter than this are neither indexed nor searched.
    MINIMUM_LETTERS = 3

    def __init__(self):
        # HTML fragment accumulated by _addReport() and friends.
        self.report = ''

        self.tagreg = re.compile(r'<[^>]+>')
        # Only well-formed entities (&amp; &#233; ...): a looser pattern
        # would swallow ordinary text from a stray '&' to the next ';'.
        self.htmlreg = re.compile(r'&#?[0-9a-zA-Z]+;')
        self.numreg = re.compile(r'[0-9]+')
        self.pat = re.compile(r'\s+')

        # Characters treated as word separators.
        self.replace_by_space = (u'(', u')', u'#', u'\'', u'{', u'}', u'[', u']',
                                 u'-', u'|', u'\t', u'\\', u'_', u'^', u'=', u'+', u'$',
                                 u'£', u'%', u'µ', u'*', u',', u'?', u';', u'.', u'/',
                                 u':', u'!', u'§', u'€', u'²')

    # Imported from generator.py
    def _addReport(self, string, color=''):
        """Append one HTML line to ``self.report``.

        The line is prefixed with the class name in bold and, when *color*
        is not empty, wrapped in a ``<span>`` of that CSS color.
        """
        line = '<b>' + self.__class__.__name__ + '</b> : ' + string
        if color != '':
            line = '<span style="color:' + color + '">' + line + '</span>'
        self.report = self.report + line + '<br/>\n'

    def _addWarning(self, string):
        """Append *string* to the report as a yellow line."""
        self._addReport(string, 'yellow')

    def _addError(self, string):
        """Append *string* to the report as a red line."""
        self._addReport(string, 'red')

    def _saveDatabase(self, hashtable):
        """Pickle *hashtable* to ``$DENOTE_ROOT/_search.db``.

        Raises KeyError when the DENOTE_ROOT environment variable is unset.
        """
        # Pickle data is binary: the file must be opened in 'wb'.
        with open(os.environ['DENOTE_ROOT'] + '/_search.db', 'wb') as f:
            pickle.dump(hashtable, f)

    def _loadDatabase(self):
        """Return the index stored on disk, or ``{}`` when there is none.

        Raises KeyError when the DENOTE_ROOT environment variable is unset.
        """
        filename = os.environ['DENOTE_ROOT'] + '/_search.db'

        if not os.path.exists(filename):
            print('No search index !')
            return {}

        # NOTE(security): unpickling runs arbitrary code if the file has
        # been tampered with; the index file must only be written by
        # _saveDatabase().
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def _to_text(self, content):
        """Return *content* as unicode text (None gives an empty string)."""
        if content is None:
            return u''
        if isinstance(content, bytes):
            return content.decode('utf-8', 'replace')
        return content

    def _note_key(self, note):
        """Return *note* as the integer id used in the index when possible."""
        try:
            return int(note)
        except (TypeError, ValueError):
            return note

    def _strip_accents(self, s):
        """Return *s* without combining marks (u'é' becomes u'e')."""
        decomposed = unicodedata.normalize('NFD', self._to_text(s))
        return u''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    def _remove_tag(self, content):
        """Return *content* reduced to words separated by single spaces.

        Tags and HTML entities become separators, digits and double quotes
        are dropped, and every character of ``replace_by_space`` as well as
        any whitespace run (newlines included) becomes one space.
        """
        content = self._to_text(content)

        # Markup is replaced by a space, not removed, so that the words on
        # each side of a tag or of a line break are not glued together.
        content = self.tagreg.sub(u' ', content)
        content = self.htmlreg.sub(u' ', content)
        content = self.numreg.sub(u'', content)
        content = content.replace(u'"', u'')

        for c in self.replace_by_space:
            content = content.replace(c, u' ')

        return self.pat.sub(u' ', content).strip()

    def _prepare_string(self, content):
        """Return *content* without markup, punctuation, digits or accents."""
        content = self._remove_tag(content)
        content = self._strip_accents(content)

        return content

    def _indexContent(self, hashtable, index, content, word_weight):
        """Add every word of *content* to *hashtable* for note *index*.

        Each occurrence of a word adds *word_weight* to the weight of the
        ``[index, weight]`` pair of that word; the pair is created at the
        front of the list the first time the note uses the word.
        """
        wordlist = self._prepare_string(content).lower().split(u' ')

        for word in wordlist:
            if len(word) < self.MINIMUM_LETTERS:
                continue
            entries = hashtable.setdefault(word, [])
            for entry in entries:
                if entry[0] == index:
                    entry[1] += word_weight
                    break
            else:
                entries.insert(0, [index, word_weight])

    def _remove_entries(self, hashtable, note_ids):
        """Drop from *hashtable* every pair whose note id is in *note_ids*.

        Words that no longer reference any note are removed as well.
        """
        for word in list(hashtable):
            kept = [entry for entry in hashtable[word] if entry[0] not in note_ids]
            if kept:
                hashtable[word] = kept
            else:
                del hashtable[word]

    def _index(self, hashtable, index):
        """Index text (weight 1) and title (weight 5) of note *index*.

        Does nothing when no such note exists.
        """
        # NOTE(review): assumes Note is a Django model exposed by the local
        # ``models`` module (see the commented-out import at the top of the
        # file) -- confirm.
        try:
            note = models.Note.objects.get(pk=index)
        except models.Note.DoesNotExist:
            return

        self._indexContent(hashtable, index, note.text, 1)
        self._indexContent(hashtable, index, note.title, 5)

    def _index_note(self, note, saveDatabase=True):
        """(Re)index the note whose id is *note*.

        Entries previously stored for this note are replaced, so indexing
        the same note twice does not inflate its weights.  The index is
        written back to disk only when *saveDatabase* is true.
        """
        hashtable = self._loadDatabase()
        note_id = int(note)

        self._remove_entries(hashtable, set([note_id]))
        self._index(hashtable, note_id)

        if saveDatabase:
            self._saveDatabase(hashtable)

    def _remove_note(self, note, saveDatabase=True):
        """Remove every entry of note *note* from the index on disk.

        With *saveDatabase* false the modified index is discarded, so the
        call has no lasting effect.
        """
        hashtable = self._loadDatabase()

        # Nothing indexed yet: nothing to remove.
        if not hashtable:
            return

        self._remove_entries(hashtable, set([self._note_key(note)]))

        if saveDatabase:
            self._saveDatabase(hashtable)

    def generate_index(self, notes):
        """Index all *notes* (objects with ``id``, ``text`` and ``title``).

        Entries already stored for these notes are replaced; entries of
        other notes are kept.  The index is then saved.
        """
        notes = list(notes)
        hashtable = self._loadDatabase()

        self._remove_entries(hashtable, set(note.id for note in notes))

        for note in notes:
            self._indexContent(hashtable, note.id, note.text, 1)
            self._indexContent(hashtable, note.id, note.title, 5)

        self._saveDatabase(hashtable)

    def index_note(self, note):
        """Index the note whose id is *note* and save the index."""
        return self._index_note(note, True)

    def delete_note(self, note):
        """Remove the note whose id is *note* from the index and save it."""
        return self._remove_note(note, True)

    def edit_note(self, note, saveDatabase=True):
        """Replace the indexed words of note *note* by its current content.

        Removal and re-indexing are done on the same in-memory index, which
        is written back only when *saveDatabase* is true.
        """
        self._index_note(note, saveDatabase)

    def search(self, string):
        """Return the ids of the notes matching *string*, best match first.

        Each query word of at least MINIMUM_LETTERS letters matches every
        indexed word that contains it; the score of a note is the sum of
        the weights of all its matches.
        """
        hashtable = self._loadDatabase()

        wordlist = self._prepare_string(string).lower().split(u' ')

        res = {}
        for word in wordlist:
            if len(word) < self.MINIMUM_LETTERS:
                continue
            for key, entries in hashtable.items():
                # Plain substring test: same result as matching
                # '.*word.*', without regex meta-character issues.
                if word in key:
                    for entry in entries:
                        res[entry[0]] = res.get(entry[0], 0) + entry[1]

        return sorted(res, key=res.get, reverse=True)