Photorec Stage 2

Photorec Stage 2 Git Source Tree

Root/photorec_stage_2.py

1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4"""
5 Copyright 2016 Grégory Soutadé
6
7 This file is part of Photorec Stage 2.
8
9 Photorec Stage 2 is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13
14 Photorec Stage 2 is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with Photorec Stage 2. If not, see <http://www.gnu.org/licenses/>.
21"""
22
23import os
24from zipfile import ZipFile
25import argparse
26import shutil
27from datetime import datetime
28
# Markers surrounding the creation date looked up in an OpenDocument meta.xml.
OOO_CREATION_DATE_START = '<meta:creation-date>'
OOO_CREATION_DATE_END = '</meta:creation-date>'

# Markers surrounding the creation date looked up in docProps/core.xml.
MS_CREATION_DATE_START = '<dcterms:created xsi:type="dcterms:W3CDTF">'
MS_CREATION_DATE_END = '</dcterms:created>'

# Content of the archive's "mimetype" entry -> extension given to the copy.
OPEN_OFFICE_MIME_TYPES = {
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.presentation': 'odp',
}

# Extension -> number of files copied so far (filled in by copy_file).
statistics = {}
42
def remove_extension(filename):
    """Split ``filename`` at its last dot.

    Returns a ``(name, extension)`` pair; the dot itself is dropped.  When
    there is no dot at all the result is ``(filename, '')``.
    """
    stem, dot, suffix = filename.rpartition('.')

    # rpartition() leaves the separator slot empty when no dot was found.
    if not dot:
        return (filename, '')

    return stem, suffix
53
def copy_file(orig_filepath, filename, outdir, verbose, extension=None):
    """Copy ``orig_filepath`` into ``outdir`` without overwriting anything.

    Parameters:
        orig_filepath: path of the file to copy.
        filename: wanted name of the copy.  When ``extension`` is not given,
            the extension is split off this name with remove_extension().
        outdir: destination directory, created if it does not exist.
        verbose: print the directory creation and the copy when true.
        extension: extension of the copy, without the leading dot.

    If ``<filename>.<extension>`` already exists in ``outdir``, the copy is
    named ``<filename>_2.<extension>``, ``<filename>_3.<extension>``, ...
    using the first free number.  The per-extension counter in the
    module-level ``statistics`` dict is incremented for every copy.
    """
    if not os.path.exists(outdir):
        if verbose:
            print('mkdirs %s' % (outdir))
        os.makedirs(outdir)

    if not extension:
        filename, extension = remove_extension(filename)

    # A file without extension must not end up with a dangling '.'.
    suffix = '.%s' % extension if extension else ''

    target = os.path.join(outdir, '%s%s' % (filename, suffix))
    cur_num = 2
    while os.path.exists(target):
        target = os.path.join(outdir, '%s_%d%s' % (filename, cur_num, suffix))
        cur_num = cur_num + 1

    if verbose:
        print('\tCopy %s => %s' % (orig_filepath, target))

    statistics[extension] = statistics.get(extension, 0) + 1

    shutil.copy(orig_filepath, target)
77
def copy_datetime_file(dt, orig_filepath, outdir, verbose, extension):
    """Copy a file into ``outdir/<year>/<month>`` and name it after ``dt``.

    The copy is called ``<year>_<month>_<day>`` (month and day on two
    digits); the actual copy and its return value come from copy_file().
    """
    dated_name = '%d_%02d_%02d' % (dt.year, dt.month, dt.day)
    month_dir = os.path.join(outdir, str(dt.year), '%02d' % dt.month)
    return copy_file(orig_filepath, dated_name, month_dir, verbose, extension)
82
def _try_open_office(orig_filepath, zipfile, zipname, filename, outdir, verbose):
    """Try to handle an archive as an OpenDocument file.

    Parameters:
        orig_filepath: path of the archive on disk.
        zipfile: the already opened ZipFile object for that archive.
        zipname: name of the archive entry holding the mime type.
        filename: original file name, used when no creation date is found.
        outdir: base output directory; the copy goes under ``office``.
        verbose: print progress information when true.

    Returns True when the file has been copied, False when the archive is
    not a known OpenDocument type or has no ``meta.xml`` entry.
    """
    with zipfile.open(zipname) as mimetype:
        mime = mimetype.read()

    # Archive entries are read as bytes while the lookup table is keyed by
    # text; decode so that the comparison also works on Python 3.
    if isinstance(mime, bytes):
        mime = mime.decode('ascii', 'replace')

    ext = OPEN_OFFICE_MIME_TYPES.get(mime.strip())
    if ext is None:
        return False

    if verbose:
        print('Found %s file' % (ext))

    try:
        meta = zipfile.open('meta.xml')
    except KeyError:
        return False

    # The handle is released even if reading fails.
    with meta:
        metadata = meta.read()
    if isinstance(metadata, bytes):
        metadata = metadata.decode('utf-8', 'replace')

    outdir = os.path.join(outdir, 'office')

    creation_date = None
    try:
        start = metadata.index(OOO_CREATION_DATE_START) + len(OOO_CREATION_DATE_START)
        end = metadata.index(OOO_CREATION_DATE_END, start)
        # Keep only 'YYYY-MM-DDTHH:MM:SS' so that a fractional-second or
        # timezone suffix does not make the parsing fail (same truncation
        # as in _try_ms_office).
        creation_date = datetime.strptime(metadata[start:end][:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Marker missing or unparsable date: fall back to the original name.
        pass

    # The copy is done outside of the try block so that a copy error is not
    # mistaken for a bad date (which would trigger a second copy).
    if creation_date is None:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, ext)
    else:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose, ext)

    return True
113
def _try_ms_office(orig_filepath, zipfile, filename, outdir, verbose, extension):
    """Try to handle an archive as a Microsoft Office document.

    Parameters:
        orig_filepath: path of the archive on disk.
        zipfile: the already opened ZipFile object for that archive.
        filename: original file name, used when no creation date is found.
        outdir: base output directory; the copy goes under ``office``.
        verbose: print progress information when true.
        extension: extension given to the copy (chosen by the caller).

    Returns True when the file has been copied, False when the archive has
    no ``docProps/core.xml`` entry.
    """
    try:
        meta = zipfile.open('docProps/core.xml')
    except KeyError:
        return False

    # The handle is released even if reading fails.
    with meta:
        metadata = meta.read()

    # Archive entries are read as bytes while the markers are text; decode
    # so that the search also works on Python 3.
    if isinstance(metadata, bytes):
        metadata = metadata.decode('utf-8', 'replace')

    outdir = os.path.join(outdir, 'office')

    creation_date = None
    try:
        start = metadata.index(MS_CREATION_DATE_START) + len(MS_CREATION_DATE_START)
        end = metadata.index(MS_CREATION_DATE_END, start)
        # Keep only 'YYYY-MM-DDTHH:MM:SS', dropping any suffix after the
        # seconds.
        creation_date = datetime.strptime(metadata[start:end][:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Marker missing or unparsable date: fall back to the original name.
        pass

    # The copy is done outside of the try block so that a copy error is not
    # mistaken for a bad date (which would trigger a second copy).
    if creation_date is None:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, extension)
    else:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose, extension)

    return True
135
def manage_zip(orig_filepath, filename, extension, outdir, verbose=False):
    """Detect office documents stored as zip archives and copy them.

    Parameters:
        orig_filepath: path of the archive on disk.
        filename: original file name, passed on to the document handlers.
        extension: extension of the original file (not used here).
        outdir: base output directory.
        verbose: print progress information when true.

    An entry named ``mimetype`` triggers the OpenDocument handler; entries
    under ``word/``, ``xl/`` and ``ppt/`` trigger the Microsoft Office
    handler with the ``docx``, ``xlsx`` and ``pptx`` extension respectively.

    Returns True when a handler copied the file.  Returns False when no
    handler did, including when the file is not a readable zip archive, so
    that the caller can fall back to its default processing.
    """
    # Only ZipFile is imported at module level.  The old spelling of the
    # exception name is used because it exists on both Python 2 and 3.
    from zipfile import BadZipfile

    ms_office_layouts = (
        ('word/', 'docx'),
        ('xl/', 'xlsx'),
        ('ppt/', 'pptx'),
    )
    # The Microsoft Office handler only looks at docProps/core.xml, so one
    # attempt per document type is enough however many entries match.
    attempted = set()

    try:
        with ZipFile(orig_filepath, 'r') as myzip:
            for name in myzip.namelist():
                # Maybe an Open Document file
                if name == 'mimetype':
                    if _try_open_office(orig_filepath, myzip, name, filename, outdir, verbose):
                        return True
                    continue

                for prefix, ms_extension in ms_office_layouts:
                    if not name.startswith(prefix) or ms_extension in attempted:
                        continue
                    attempted.add(ms_extension)
                    if _try_ms_office(orig_filepath, myzip, filename, outdir, verbose, ms_extension):
                        return True
    except BadZipfile:
        # Damaged or truncated archive: report "not handled" rather than
        # aborting the whole run.
        if verbose:
            print('\tInvalid zip archive %s' % (orig_filepath))
        return False

    return False
155
156def _escape_name(name):
157 name = name.replace('/', '')
158 name = name.replace('\\', '')
159
160 return name
161
162def _append_tag(filename, tag):
163 if not tag: return filename
164
165 if filename:
166 filename = '%s - ' % filename
167
168 if type(tag) == int:
169 filename ='%s%d' % (filename, tag)
170 else:
171 filename ='%s%s' % (filename, _escape_name(tag))
172
173 return filename
174
def manage_audio(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy an audio file to a location derived from its tags.

    Parameters:
        orig_filepath: path of the audio file on disk.
        filename: original file name, used when the tags give no name.
        extension: extension given to the copy.
        outdir: base output directory.
        verbose: print progress information when true.

    The copy goes to ``multimedia/audio/<artist>/<album>`` (each level only
    when the tag is set, ``unknown`` when neither is usable) and is named
    ``<track number> - <title>``.

    Returns True when the file has been copied, False when the tags cannot
    be read or when artist, album and title are all empty.
    """
    try:
        tags = eyed3.load(orig_filepath).tag
    except Exception:
        # Best effort: whatever goes wrong while reading the tags, the file
        # is left to the caller's default processing.
        return False

    if not tags or not (tags.artist or tags.album or tags.title):
        return False

    outdir = os.path.join(outdir, 'multimedia', 'audio')

    new_filename = ''
    if tags.track_num[0]:
        new_filename = _append_tag('', tags.track_num[0])
    new_filename = _append_tag(new_filename, tags.title)
    if not new_filename:
        # Neither track number nor title: keep the original name instead of
        # creating a file called '.<extension>'.
        new_filename = remove_extension(filename)[0]

    subdirs = []
    for tag_value in (tags.artist, tags.album):
        if not tag_value:
            continue
        subdir = _escape_name(tag_value)
        # Tags come from the recovered file itself: ignore values that are
        # empty once escaped or that would move the copy out of the audio
        # directory.
        if subdir and subdir not in ('.', '..'):
            subdirs.append(subdir)

    if not subdirs:
        subdirs.append('unknown')
    outdir = os.path.join(outdir, *subdirs)

    # The extension is passed explicitly so that a dot inside the title is
    # never taken for an extension separator.
    copy_file(orig_filepath, new_filename, outdir, verbose, extension)

    return True
206
def manage_picture(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy a picture to a location derived from its EXIF shooting date.

    Parameters:
        orig_filepath: path of the picture on disk.
        filename: original file name (not used here).
        extension: extension given to the copy.
        outdir: base output directory; the copy goes under
            ``multimedia/pictures``.
        verbose: print progress information when true.

    Returns True when the file has been copied, False when the EXIF data
    cannot be read, has no 'EXIF DateTimeOriginal' entry, or holds a date
    that cannot be parsed.
    """
    with open(orig_filepath, 'rb') as f:
        try:
            tags = exifread.process_file(f)
        except Exception:
            # Best effort: whatever goes wrong while reading the EXIF data,
            # the file is left to the caller's default processing.
            return False

    try:
        original_date = tags['EXIF DateTimeOriginal']
    except KeyError:  # No 'EXIF DateTimeOriginal'
        return False

    try:
        dt = datetime.strptime(original_date.values, '%Y:%m:%d %H:%M:%S')
    except (ValueError, TypeError, AttributeError):
        print('Invalid date format \'%s\'' % original_date)
        return False

    outdir = os.path.join(outdir, 'multimedia', 'pictures')

    copy_datetime_file(dt, orig_filepath, outdir, verbose, extension)
    return True
229
# Multipliers indexed by unit: no unit, k, m, g.
_multiplier_bytes = (1, 1024, 1024*1024, 1024*1024*1024)
_multiplier_bits = (1, 1000, 1000*1000, 1000*1000*1000)

def parse_min_sizes(min_sizes_comma):
    """Parse the value of the --min-size option.

    ``min_sizes_comma`` is a comma separated list of ``extension:size``
    elements, for instance ``mp3:1M,txt:4k,doc:8``.  A size is an integer
    optionally followed by a unit (``k``, ``m`` or ``g``, any case), an
    optional ``i`` and an optional ``b``/``B``.

    Returns a dict mapping each extension to its minimum size; an empty or
    missing value gives an empty dict.

    Raises ValueError when an element is not of the form ``extension:size``
    or when the size is not an integer.
    """
    if not min_sizes_comma:
        return {}

    unit_index = {'k': 1, 'm': 2, 'g': 3}

    min_sizes = {}
    for element in min_sizes_comma.split(','):
        sub_elements = element.split(':')
        if len(sub_elements) != 2:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))
        extension, size = sub_elements

        multiplier_tab = _multiplier_bytes
        if size[-1:] in ('b', 'B'):
            size = size[:-1]
        if size[-1:] == 'i':
            size = size[:-1]
            # NOTE(review): an 'i' suffix selects the 1000-based table and
            # its absence the 1024-based one, the reverse of the usual
            # KiB/kB convention.  Kept as is - confirm what is intended.
            multiplier_tab = _multiplier_bits

        # No recognised unit letter means a plain count (index 0).
        multiplier_idx = unit_index.get(size[-1:].lower(), 0)
        if multiplier_idx:
            size = size[:-1]

        try:
            size = int(size)
        except ValueError:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))

        min_sizes[extension] = size * multiplier_tab[multiplier_idx]

    return min_sizes
267
# ---------------------------------------------------------------------------
# Script body: parse the command line, walk the input directory and copy
# every file either through a format specific handler or as is.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Photorec post script analysis: try to recover filename with metadata (supported files : docx,xlsx,pptx,odt,ods,odp,mp3,jpg).')

parser.add_argument('--in', dest='in_dir', help='Directory in (with photorec results)', required=True)
parser.add_argument('--out', dest='out_dir', help='Directory out (script results)', required=True)
parser.add_argument('--max-files-per-temp', dest='max_files_per_temp',
                    help='Maximum unknown files in temporary directory, -1 for no temp dir',
                    type=int, default=50)
parser.add_argument('--skip-ext', dest='skip_ext', help='Don\'t copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--only-ext', dest='only_ext', help='Copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--min-size', dest='min_size', help='Minimum size for an extension (comma separated eg : mp3:1M,txt:4k,doc:8)', default='')
parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode')
parser.add_argument('--quiet', dest='quiet', action='store_true', default=False, help='Quiet mode')

args = parser.parse_args()

# Extension -> handler.  A handler returns True when it has copied the file.
file_ops = {
    'zip': manage_zip,
}

# Optional back ends: a format is only registered when its package imports.
try:
    import eyed3
    file_ops['mp3'] = manage_audio
except ImportError:
    print('Package eyed3 not installed, mp3 format not supported. Use pip install eyed3')

try:
    import exifread
    file_ops['jpg'] = manage_picture
except ImportError:
    print('Package exifread not installed, jpg format not supported. Use pip install exifread')

file_ops_keys = file_ops.keys()

cur_out_dir = 0
cur_files_in_out_dir = 0
skip_exts = args.skip_ext.split(',') if args.skip_ext else None
only_exts = args.only_ext.split(',') if args.only_ext else None

min_sizes = parse_min_sizes(args.min_size)
min_sizes_keys = min_sizes.keys()

# Disable (force) verbose on quiet
if args.quiet:
    args.verbose = False

if args.max_files_per_temp == -1:
    # No temporary directories: unhandled files go straight to the output.
    outdir = args.out_dir
else:
    # First numbered directory.  It must exist as a name before the first
    # plain copy, which happens before any roll-over to the next directory.
    outdir = os.path.join(args.out_dir, str(cur_out_dir))

for root, dirs, files in os.walk(args.in_dir):
    for filename in files:

        full_path = os.path.join(root, filename)
        _, cur_extension = remove_extension(filename)

        # Only some extensions
        if only_exts and cur_extension not in only_exts:
            if args.verbose:
                print('Skipping %s (only extension)' % (full_path))
            continue

        # Skipping some extensions
        if skip_exts and cur_extension in skip_exts:
            if args.verbose:
                print('Skipping %s (skip extension)' % (full_path))
            continue

        # Min sizes
        if cur_extension in min_sizes:
            statinfo = os.stat(full_path)
            if statinfo.st_size < min_sizes[cur_extension]:
                if args.verbose:
                    print('Skipping %s (min size)' % (full_path))
                continue

        # Filtered files
        if cur_extension in file_ops:
            if args.verbose:
                print('Filter \'%s\'' % (full_path))
            ret = file_ops[cur_extension](full_path, filename, cur_extension,
                                          args.out_dir, args.verbose)

            if ret:
                continue

        # Simple copy
        if args.max_files_per_temp != -1:
            # Current directory is full: move on to the next numbered one.
            if cur_files_in_out_dir == args.max_files_per_temp:
                cur_files_in_out_dir = 0
                cur_out_dir = cur_out_dir + 1
                outdir = os.path.join(args.out_dir, str(cur_out_dir))
            cur_files_in_out_dir = cur_files_in_out_dir + 1

        if args.verbose:
            print('Std copy %s => %s' % (full_path, outdir))

        copy_file(full_path, filename, outdir, verbose=False)

if not args.quiet:
    print('Statistics :\n')
    for key in sorted(statistics):
        print('\t.%s\t=> %d' % (key, statistics[key]))

Archive Download this file

Branches