Photorec Stage 2

Photorec Stage 2 Commit Details

Date: 2016-09-12 19:37:14 (3 years 10 months ago)
Author: Grégory Soutadé
Branch: master
Commit: 26c784e59fff8641141c08a47e193473d71b5e23
Message: Initial commit

Changes:
A photorec_stage_2.py (full)

File differences

photorec_stage_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright 2016 Grégory Soutadé
This file is part of Photorec Stage 2.
Photorec Stage 2 is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Photorec Stage 2 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Photorec Stage 2. If not, see <http://www.gnu.org/licenses/>.
"""
import os
from zipfile import ZipFile
import argparse
import shutil
from datetime import datetime
# Markers surrounding the creation date in the meta.xml member of an
# OpenDocument archive.
OOO_CREATION_DATE_START = '<meta:creation-date>'
OOO_CREATION_DATE_END = '</meta:creation-date>'

# Markers surrounding the creation date in the docProps/core.xml member of a
# Microsoft Office (OOXML) archive.
MS_CREATION_DATE_START = '<dcterms:created xsi:type="dcterms:W3CDTF">'
MS_CREATION_DATE_END = '</dcterms:created>'

# Content of the "mimetype" archive member -> extension given to the copy.
OPEN_OFFICE_MIME_TYPES = {
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.presentation': 'odp',
}

# Number of copied files per extension; updated by copy_file() and printed
# at the end of the run.
statistics = {}
def remove_extension(filename):
    """Split *filename* at its last dot.

    Returns a ``(stem, extension)`` tuple. The extension carries no leading
    dot and is the empty string when *filename* contains no dot at all.
    """
    stem, dot, suffix = filename.rpartition('.')
    if not dot:
        # No separator found: rpartition() left the whole name in the last
        # slot, so hand the untouched name back with an empty extension.
        return (filename, '')
    return stem, suffix
def copy_file(orig_filepath, filename, outdir, verbose, extension=None):
    """Copy *orig_filepath* into *outdir* without overwriting anything.

    Parameters:
        orig_filepath: path of the file to copy.
        filename: name wanted for the copy. When *extension* is not given it
            is split with remove_extension() to obtain the extension.
        outdir: destination directory, created (with parents) if missing.
        verbose: print the directory creation and the copy when true.
        extension: extension of the copy, without the leading dot.

    If the target name is already taken, ``_2``, ``_3``, ... is appended to
    the stem until a free name is found. The per-extension counter in
    ``statistics`` is incremented for every copy.
    """
    if not os.path.exists(outdir):
        if verbose:
            print('mkdirs %s' % (outdir))
        os.makedirs(outdir)

    if not extension:
        filename, extension = remove_extension(filename)

    # A file without extension must keep its bare name: formatting it with
    # '%s.%s' would create "name." (trailing dot).
    suffix = '.%s' % extension if extension else ''

    target = os.path.join(outdir, '%s%s' % (filename, suffix))
    cur_num = 2
    while os.path.exists(target):
        target = os.path.join(outdir, '%s_%d%s' % (filename, cur_num, suffix))
        cur_num = cur_num + 1

    if verbose:
        print('\tCopy %s => %s' % (orig_filepath, target))
    statistics[extension] = statistics.get(extension, 0) + 1
    shutil.copy(orig_filepath, target)
def copy_datetime_file(dt, orig_filepath, outdir, verbose, extension):
    """Copy a file under ``<outdir>/<year>/<month>/`` with a date-based name.

    The copy is named ``YYYY_MM_DD`` after *dt* and delegated to copy_file(),
    whose return value is passed through.
    """
    month = '%02d' % dt.month
    dated_dir = os.path.join(outdir, str(dt.year), month)
    dated_name = '%d_%s_%02d' % (dt.year, month, dt.day)
    return copy_file(orig_filepath, dated_name, dated_dir, verbose, extension)
def _try_open_office(orig_filepath, zipfile, zipname, filename, outdir, verbose):
    """Try to handle an archive as an OpenDocument file.

    Parameters:
        orig_filepath: path of the archive on disk (the file that is copied).
        zipfile: the already opened ZipFile object for that archive.
        zipname: name of the archive member holding the MIME type.
        filename: original file name, used when no creation date is found.
        outdir: base output directory; copies go into its 'office' subdir.
        verbose: print progress information when true.

    Returns True when the file has been copied, False when the MIME type is
    not a known OpenDocument one or the archive has no meta.xml member.
    """
    with zipfile.open(zipname) as mimetype:
        mime = mimetype.read()
    # Archive members are read as bytes: decode before the lookup and
    # tolerate a trailing newline after the MIME type.
    mime = mime.decode('ascii', 'replace').strip()
    ext = OPEN_OFFICE_MIME_TYPES.get(mime)
    if ext is None:
        return False
    if verbose:
        print('Found %s file' % (ext))

    try:
        meta = zipfile.open('meta.xml')
    except KeyError:
        # ZipFile.open() raises KeyError for a missing member
        return False
    with meta:
        metadata = meta.read().decode('utf-8', 'replace')

    outdir = os.path.join(outdir, 'office')
    try:
        start = metadata.index(OOO_CREATION_DATE_START) + len(OOO_CREATION_DATE_START)
        end = metadata.index(OOO_CREATION_DATE_END, start)
        # Keep only 'YYYY-MM-DDTHH:MM:SS' so that fractional seconds do not
        # make the parsing fail (same truncation as _try_ms_office).
        creation_date = datetime.strptime(metadata[start:end][:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Marker not found or unparsable date: fall back on the original name
        creation_date = None

    if creation_date is None:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, ext)
    else:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose, ext)
    return True
def _try_ms_office(orig_filepath, zipfile, filename, outdir, verbose, extension):
    """Try to handle an archive as a Microsoft Office (OOXML) file.

    Parameters:
        orig_filepath: path of the archive on disk (the file that is copied).
        zipfile: the already opened ZipFile object for that archive.
        filename: original file name, used when no creation date is found.
        outdir: base output directory; copies go into its 'office' subdir.
        verbose: print progress information when true.
        extension: extension given to the copy.

    Returns True when the file has been copied, False when the archive has
    no docProps/core.xml member.
    """
    try:
        meta = zipfile.open('docProps/core.xml')
    except KeyError:
        # ZipFile.open() raises KeyError for a missing member
        return False
    with meta:
        # Archive members are read as bytes: decode before searching markers
        metadata = meta.read().decode('utf-8', 'replace')

    outdir = os.path.join(outdir, 'office')
    try:
        start = metadata.index(MS_CREATION_DATE_START) + len(MS_CREATION_DATE_START)
        end = metadata.index(MS_CREATION_DATE_END, start)
        # Keep only 'YYYY-MM-DDTHH:MM:SS', dropping whatever follows
        creation_date = datetime.strptime(metadata[start:end][:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Marker not found or unparsable date: fall back on the original name
        creation_date = None

    if creation_date is None:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, extension)
    else:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose, extension)
    return True
def manage_zip(orig_filepath, filename, extension, outdir, verbose=False):
    """Detect office documents stored as zip archives and copy them.

    The member names of the archive are scanned: a 'mimetype' member triggers
    the OpenDocument detection, members under 'word/', 'xl/' or 'ppt/'
    trigger the Microsoft Office one (docx, xlsx, pptx).

    Parameters:
        orig_filepath: path of the archive.
        filename: original file name (fallback name for the copy).
        extension: extension of the original file (not used here).
        outdir: base output directory.
        verbose: print progress information when true.

    Returns True when the file has been recognised and copied, False
    otherwise (including when the archive cannot be read as a zip file).
    """
    # Local import: only ZipFile is imported at the top of the file
    from zipfile import BadZipfile

    ms_office_dirs = (
        ('word/', 'docx'),
        ('xl/', 'xlsx'),
        ('ppt/', 'pptx'),
    )

    try:
        with ZipFile(orig_filepath, 'r') as myzip:
            for name in myzip.namelist():
                # Maybe an Open Document file
                if name == 'mimetype':
                    if _try_open_office(orig_filepath, myzip, name, filename, outdir, verbose):
                        return True
                for prefix, ms_extension in ms_office_dirs:
                    if name.startswith(prefix):
                        if _try_ms_office(orig_filepath, myzip, filename, outdir, verbose, ms_extension):
                            return True
                        break
    except BadZipfile:
        # Damaged archive: report "not handled" instead of aborting the run
        if verbose:
            print('Invalid zip file %s' % (orig_filepath))
        return False
    return False
def _escape_name(name):
name = name.replace('/', '')
name = name.replace('\\', '')
return name
def _append_tag(filename, tag):
if not tag: return filename
if filename:
filename = '%s - ' % filename
if type(tag) == int:
filename ='%s%d' % (filename, tag)
else:
filename ='%s%s' % (filename, _escape_name(tag))
return filename
def manage_audio(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy an audio file under a name and directory built from its tags.

    The copy goes to ``<outdir>/multimedia/audio/[<artist>/][<album>/]``
    (or the 'unknown' subdirectory when neither artist nor album is set) and
    is named ``<track> - <title>``.

    Parameters:
        orig_filepath: path of the audio file.
        filename: original file name, used when the tags give no name.
        extension: extension given to the copy.
        outdir: base output directory.
        verbose: print progress information when true.

    Returns True when the file has been copied, False when the tags cannot
    be read or hold neither artist, album nor title.
    """
    try:
        tags = eyed3.load(orig_filepath).tag
    except Exception:
        # Unreadable file. NOTE(review): eyed3.load() presumably returns
        # None for files it does not recognise, which ends up here too.
        return False

    if not tags or not (tags.artist or tags.album or tags.title):
        return False

    # _append_tag() ignores empty tags, so a missing track number or title
    # is simply left out of the name
    new_filename = _append_tag('', tags.track_num[0])
    new_filename = _append_tag(new_filename, tags.title)
    if not new_filename:
        # Only artist/album known: keep the original stem rather than
        # producing a file called '.<extension>'
        new_filename = remove_extension(filename)[0]

    outdir = os.path.join(outdir, 'multimedia', 'audio')
    subdir = False
    if tags.artist:
        outdir = os.path.join(outdir, _escape_name(tags.artist))
        subdir = True
    if tags.album:
        outdir = os.path.join(outdir, _escape_name(tags.album))
        subdir = True
    if not subdir:
        outdir = os.path.join(outdir, 'unknown')

    # The extension is given explicitly so that a dot in the title is never
    # mistaken for an extension separator
    copy_file(orig_filepath, new_filename, outdir, verbose, extension)
    return True
def manage_picture(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy a picture under a name built from its EXIF shooting date.

    The copy goes to ``<outdir>/multimedia/pictures/<year>/<month>/`` and is
    named after the 'EXIF DateTimeOriginal' tag.

    Parameters:
        orig_filepath: path of the picture.
        filename: original file name (not used here).
        extension: extension given to the copy.
        outdir: base output directory.
        verbose: print progress information when true.

    Returns True when the file has been copied, False when the EXIF data
    cannot be read or holds no usable original date.
    """
    with open(orig_filepath, 'rb') as f:
        try:
            tags = exifread.process_file(f)
        except Exception:
            return False

    outdir = os.path.join(outdir, 'multimedia', 'pictures')

    try:
        date_tag = tags['EXIF DateTimeOriginal']
    except KeyError:  # No 'EXIF DateTimeOriginal'
        return False

    try:
        dt = datetime.strptime(date_tag.values, '%Y:%m:%d %H:%M:%S')
    except (TypeError, ValueError):
        print('Invalid date format \'%s\'' % date_tag)
        return False

    copy_datetime_file(dt, orig_filepath, outdir, verbose, extension)
    return True
# Multipliers indexed by unit: none, kilo, mega, giga
_multiplier_bytes = (1, 1024, 1024*1024, 1024*1024*1024)
_multiplier_bits = (1, 1000, 1000*1000, 1000*1000*1000)


def parse_min_sizes(min_sizes_comma):
    """Parse the --min-size argument into an ``{extension: size}`` dict.

    *min_sizes_comma* is a comma separated list of ``extension:size``
    elements (eg 'mp3:1M,txt:4k,doc:8'). A size is an integer optionally
    followed by a unit (k, m or g, any case), an optional 'i' and an
    optional 'b'/'B'. Units are powers of 1024, or powers of 1000 when the
    'i' is present.

    Returns an empty dict for an empty or None argument.
    Raises ValueError when an element cannot be parsed.
    """
    if not min_sizes_comma:
        return {}

    unit_indexes = {'k': 1, 'K': 1, 'm': 2, 'M': 2, 'g': 3, 'G': 3}
    min_sizes = {}
    for element in min_sizes_comma.split(','):
        sub_elements = element.split(':')
        if len(sub_elements) != 2:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))
        extension, size = sub_elements

        multiplier_tab = _multiplier_bytes
        if size[-1:] in ('b', 'B'):
            size = size[:-1]
        if size[-1:] == 'i':
            # NOTE(review): 'i' selects the powers of 1000 here, the
            # opposite of the IEC convention (KiB = 1024) - confirm intent.
            size = size[:-1]
            multiplier_tab = _multiplier_bits

        # No unit means plain bytes (index 0); 'g' maps to the giga entry
        multiplier_idx = unit_indexes.get(size[-1:], 0)
        if multiplier_idx:
            size = size[:-1]

        try:
            size = int(size)
        except ValueError:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))
        min_sizes[extension] = size * multiplier_tab[multiplier_idx]
    return min_sizes
# Command line interface. Arguments are parsed as soon as the script runs.
parser = argparse.ArgumentParser(
    description='Photorec post script analysis: try to recover filename with metadata '
                '(supported files : docx,xlsx,pptx,odt,ods,odp,mp3,jpg).')
parser.add_argument('--in', dest='in_dir', help='Directory in (with photorec results)', required=True)
parser.add_argument('--out', dest='out_dir', help='Directory out (script results)', required=True)
parser.add_argument('--max-files-per-temp', dest='max_files_per_temp',
                    help='Maximum unknown files in temporary directory, -1 for no temp dir',
                    type=int, default=50)
parser.add_argument('--skip-ext', dest='skip_ext', help='Don\'t copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--only-ext', dest='only_ext', help='Copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--min-size', dest='min_size', help='Minimum size for an extension (comma separated eg : mp3:1M,txt:4k,doc:8)', default='')
parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode')
parser.add_argument('--quiet', dest='quiet', action='store_true', default=False, help='Quiet mode')
args = parser.parse_args()
# Extension -> handler trying to rename the file from its metadata.
# A handler returns True when it has copied the file itself.
file_ops = {
    'zip': manage_zip,
}

# Optional third party packages: a format is only supported when the
# package reading its metadata can be imported.
try:
    import eyed3
    file_ops['mp3'] = manage_audio
except ImportError:
    print('Package eyed3 not installed, mp3 format not supported. Use pip install eyed3')

try:
    import exifread
    file_ops['jpg'] = manage_picture
except ImportError:
    print('Package exifread not installed, jpg format not supported. Use pip install exifread')
# State of the run, derived from the command line
file_ops_keys = file_ops.keys()

# Numbered output directory in use and how many files it already holds
cur_out_dir = 0
cur_files_in_out_dir = 0

# Extension filters: None when the matching option is empty
skip_exts = args.skip_ext.split(',') if args.skip_ext else None
only_exts = args.only_ext.split(',') if args.only_ext else None

min_sizes = parse_min_sizes(args.min_size)
min_sizes_keys = min_sizes.keys()

# Disable (force) verbose on quiet
if args.quiet:
    args.verbose = False

# Without temporary directories everything goes directly in the output one
if args.max_files_per_temp == -1:
    outdir = args.out_dir
# Walk the photorec results: apply the extension filters, then hand each
# file to its metadata handler, falling back on a plain copy.
for root, dirs, files in os.walk(args.in_dir):
    for filename in files:
        full_path = os.path.join(root, filename)
        _, cur_extension = remove_extension(filename)

        # Only some extensions
        if only_exts and cur_extension not in only_exts:
            if args.verbose:
                print('Skipping %s (only extension)' % (full_path))
            continue

        # Skipping some extensions
        if skip_exts and cur_extension in skip_exts:
            if args.verbose:
                print('Skipping %s (skip extension)' % (full_path))
            continue

        # Min sizes
        if min_sizes and cur_extension in min_sizes_keys:
            statinfo = os.stat(full_path)
            if statinfo.st_size < min_sizes[cur_extension]:
                if args.verbose:
                    print('Skipping %s (min size)' % (full_path))
                continue

        # Filtered files
        if cur_extension in file_ops_keys:
            if args.verbose:
                print('Filter \'%s\'' % (full_path))
            try:
                ret = file_ops[cur_extension](full_path, filename, cur_extension,
                                              args.out_dir, args.verbose)
            except Exception as error:
                # Recovered files may be damaged: a handler failure must not
                # abort the whole run, the file gets a simple copy instead
                ret = False
                if not args.quiet:
                    print('Filter failed on %s (%s), simple copy instead' % (full_path, error))
            if ret:
                continue

        # Simple copy
        if args.max_files_per_temp != -1:
            # Move on to the next numbered directory once this one is full
            if cur_files_in_out_dir == args.max_files_per_temp:
                cur_files_in_out_dir = 0
                cur_out_dir = cur_out_dir + 1
            outdir = os.path.join(args.out_dir, str(cur_out_dir))
            cur_files_in_out_dir = cur_files_in_out_dir + 1

        if args.verbose:
            print('Std copy %s => %s' % (full_path, outdir))
        # The copy has just been announced above, hence verbose=False
        copy_file(full_path, filename, outdir, verbose=False)
# Final report: number of copied files per extension, sorted by extension
if not args.quiet:
    print('Statistics :\n')
    for ext, count in sorted(statistics.items()):
        print('\t.%s\t=> %d' % (ext, count))

Archive Download the corresponding diff file

Branches