1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 """
39 Provides filesystem-related objects.
40 @sort: FilesystemList, BackupFileList, PurgeItemList
41 @author: Kenneth J. Pronovici <pronovic@ieee.org>
42 """
43
44
45
46
47
48
49
50 import os
51 import re
52 import math
53 import logging
54 import tarfile
55 import hashlib
56
57
58 from CedarBackup3.knapsack import firstFit, bestFit, worstFit, alternateFit
59 from CedarBackup3.util import AbsolutePathList, UnorderedList, RegexList
60 from CedarBackup3.util import removeKeys, displayBytes, calculateFileAge, encodePath, dereferenceLink
61
62
63
64
65
66
67 logger = logging.getLogger("CedarBackup3.log.filesystem")
75
76
77
78
79
80 """
81 Represents a list of filesystem items.
82
83 This is a generic class that represents a list of filesystem items. Callers
84 can add individual files or directories to the list, or can recursively add
85 the contents of a directory. The class also allows for up-front exclusions
86 in several forms (all files, all directories, all items matching a pattern,
87 all items whose basename matches a pattern, or all directories containing a
88 specific "ignore file"). Symbolic links are typically backed up
89 non-recursively, i.e. the link to a directory is backed up, but not the
90 contents of that link (we don't want to deal with recursive loops, etc.).
91
92 The custom methods such as L{addFile} will only add items if they exist on
93 the filesystem and do not match any exclusions that are already in place.
94 However, since a FilesystemList is a subclass of Python's standard list
95 class, callers can also add items to the list in the usual way, using
96 methods like C{append()} or C{insert()}. No validations apply to items
97 added to the list in this way; however, many list-manipulation methods deal
98 "gracefully" with items that don't exist in the filesystem, often by
99 ignoring them.
100
101 Once a list has been created, callers can remove individual items from the
102 list using standard methods like C{pop()} or C{remove()} or they can use
103 custom methods to remove specific types of entries or entries which match a
104 particular pattern.
105
106 @note: Regular expression patterns that apply to paths are assumed to be
107 bounded at front and back by the beginning and end of the string, i.e. they
108 are treated as if they begin with C{^} and end with C{$}. This is true
109 whether we are matching a complete path or a basename.
110
111 @sort: __init__, addFile, addDir, addDirContents, removeFiles, removeDirs,
112 removeLinks, removeMatch, removeInvalid, normalize,
113 excludeFiles, excludeDirs, excludeLinks, excludePaths,
114 excludePatterns, excludeBasenamePatterns, ignoreFile
115 """
116
117
118
119
120
121
139
140
141
142
143
144
146 """
147 Property target used to set the exclude files flag.
148 No validations, but we normalize the value to C{True} or C{False}.
149 """
150 if value:
151 self._excludeFiles = True
152 else:
153 self._excludeFiles = False
154
156 """
157 Property target used to get the exclude files flag.
158 """
159 return self._excludeFiles
160
162 """
163 Property target used to set the exclude directories flag.
164 No validations, but we normalize the value to C{True} or C{False}.
165 """
166 if value:
167 self._excludeDirs = True
168 else:
169 self._excludeDirs = False
170
172 """
173 Property target used to get the exclude directories flag.
174 """
175 return self._excludeDirs
176
178 """
179 Property target used to set the exclude soft links flag.
180 No validations, but we normalize the value to C{True} or C{False}.
181 """
182 if value:
183 self._excludeLinks = True
184 else:
185 self._excludeLinks = False
186
188 """
189 Property target used to get the exclude soft links flag.
190 """
191 return self._excludeLinks
192
194 """
195 Property target used to set the exclude paths list.
196 A C{None} value is converted to an empty list.
197 Elements do not have to exist on disk at the time of assignment.
198 @raise ValueError: If any list element is not an absolute path.
199 """
200 self._excludePaths = AbsolutePathList()
201 if value is not None:
202 self._excludePaths.extend(value)
203
205 """
206 Property target used to get the absolute exclude paths list.
207 """
208 return self._excludePaths
209
211 """
212 Property target used to set the exclude patterns list.
213 A C{None} value is converted to an empty list.
214 """
215 self._excludePatterns = RegexList()
216 if value is not None:
217 self._excludePatterns.extend(value)
218
220 """
221 Property target used to get the exclude patterns list.
222 """
223 return self._excludePatterns
224
226 """
227 Property target used to set the exclude basename patterns list.
228 A C{None} value is converted to an empty list.
229 """
230 self._excludeBasenamePatterns = RegexList()
231 if value is not None:
232 self._excludeBasenamePatterns.extend(value)
233
235 """
236 Property target used to get the exclude basename patterns list.
237 """
238 return self._excludeBasenamePatterns
239
241 """
242 Property target used to set the ignore file.
243 The value must be a non-empty string if it is not C{None}.
244 @raise ValueError: If the value is an empty string.
245 """
246 if value is not None:
247 if len(value) < 1:
248 raise ValueError("The ignore file must be a non-empty string.")
249 self._ignoreFile = value
250
252 """
253 Property target used to get the ignore file.
254 """
255 return self._ignoreFile
256
257 excludeFiles = property(_getExcludeFiles, _setExcludeFiles, None, "Boolean indicating whether files should be excluded.")
258 excludeDirs = property(_getExcludeDirs, _setExcludeDirs, None, "Boolean indicating whether directories should be excluded.")
259 excludeLinks = property(_getExcludeLinks, _setExcludeLinks, None, "Boolean indicating whether soft links should be excluded.")
260 excludePaths = property(_getExcludePaths, _setExcludePaths, None, "List of absolute paths to be excluded.")
261 excludePatterns = property(_getExcludePatterns, _setExcludePatterns, None,
262 "List of regular expression patterns (matching complete path) to be excluded.")
263 excludeBasenamePatterns = property(_getExcludeBasenamePatterns, _setExcludeBasenamePatterns,
264 None, "List of regular expression patterns (matching basename) to be excluded.")
265 ignoreFile = property(_getIgnoreFile, _setIgnoreFile, None, "Name of file which will cause directory contents to be ignored.")
266
267
268
269
270
271
273 """
274 Adds a file to the list.
275
276 The path must exist and must be a file or a link to an existing file. It
277 will be added to the list subject to any exclusions that are in place.
278
279 @param path: File path to be added to the list
280 @type path: String representing a path on disk
281
282 @return: Number of items added to the list.
283
284 @raise ValueError: If path is not a file or does not exist.
285 @raise ValueError: If the path could not be encoded properly.
286 """
287 path = encodePath(path)
288 if not os.path.exists(path) or not os.path.isfile(path):
289 logger.debug("Path [%s] is not a file or does not exist on disk.", path)
290 raise ValueError("Path is not a file or does not exist on disk.")
291 if self.excludeLinks and os.path.islink(path):
292 logger.debug("Path [%s] is excluded based on excludeLinks.", path)
293 return 0
294 if self.excludeFiles:
295 logger.debug("Path [%s] is excluded based on excludeFiles.", path)
296 return 0
297 if path in self.excludePaths:
298 logger.debug("Path [%s] is excluded based on excludePaths.", path)
299 return 0
300 for pattern in self.excludePatterns:
301 pattern = encodePath(pattern)
302 if re.compile(r"^%s$" % pattern).match(path):
303 logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
304 return 0
305 for pattern in self.excludeBasenamePatterns:
306 pattern = encodePath(pattern)
307 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
308 logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
309 return 0
310 self.append(path)
311 logger.debug("Added file to list: [%s]", path)
312 return 1
313
315 """
316 Adds a directory to the list.
317
318 The path must exist and must be a directory or a link to an existing
319 directory. It will be added to the list subject to any exclusions that
320 are in place. The L{ignoreFile} does not apply to this method, only to
321 L{addDirContents}.
322
323 @param path: Directory path to be added to the list
324 @type path: String representing a path on disk
325
326 @return: Number of items added to the list.
327
328 @raise ValueError: If path is not a directory or does not exist.
329 @raise ValueError: If the path could not be encoded properly.
330 """
331 path = encodePath(path)
332 path = normalizeDir(path)
333 if not os.path.exists(path) or not os.path.isdir(path):
334 logger.debug("Path [%s] is not a directory or does not exist on disk.", path)
335 raise ValueError("Path is not a directory or does not exist on disk.")
336 if self.excludeLinks and os.path.islink(path):
337 logger.debug("Path [%s] is excluded based on excludeLinks.", path)
338 return 0
339 if self.excludeDirs:
340 logger.debug("Path [%s] is excluded based on excludeDirs.", path)
341 return 0
342 if path in self.excludePaths:
343 logger.debug("Path [%s] is excluded based on excludePaths.", path)
344 return 0
345 for pattern in self.excludePatterns:
346 pattern = encodePath(pattern)
347 if re.compile(r"^%s$" % pattern).match(path):
348 logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
349 return 0
350 for pattern in self.excludeBasenamePatterns:
351 pattern = encodePath(pattern)
352 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
353 logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
354 return 0
355 self.append(path)
356 logger.debug("Added directory to list: [%s]", path)
357 return 1
358
359 - def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
360 """
361 Adds the contents of a directory to the list.
362
363 The path must exist and must be a directory or a link to a directory.
364 The contents of the directory (as well as the directory path itself) will
365 be recursively added to the list, subject to any exclusions that are in
366 place. If you only want the directory and its immediate contents to be
367 added, then pass in C{recursive=False}.
368
369 @note: If a directory's absolute path matches an exclude pattern or path,
370 or if the directory contains the configured ignore file, then the
371 directory and all of its contents will be recursively excluded from the
372 list.
373
374 @note: If the passed-in directory happens to be a soft link, it will be
375 recursed. However, the linkDepth parameter controls whether any soft
376 links I{within} the directory will be recursed. The link depth is
377 maximum depth of the tree at which soft links should be followed. So, a
378 depth of 0 does not follow any soft links, a depth of 1 follows only
379 links within the passed-in directory, a depth of 2 follows the links at
380 the next level down, etc.
381
382 @note: Any invalid soft links (i.e. soft links that point to
383 non-existent items) will be silently ignored.
384
385 @note: The L{excludeDirs} flag only controls whether any given directory
386 path itself is added to the list once it has been discovered. It does
387 I{not} modify any behavior related to directory recursion.
388
389 @note: If you call this method I{on a link to a directory} that link will
390 never be dereferenced (it may, however, be followed).
391
392 @param path: Directory path whose contents should be added to the list
393 @type path: String representing a path on disk
394
395 @param recursive: Indicates whether directory contents should be added recursively.
396 @type recursive: Boolean value
397
398 @param addSelf: Indicates whether the directory itself should be added to the list.
399 @type addSelf: Boolean value
400
401 @param linkDepth: Maximum depth of the tree at which soft links should be followed
402 @type linkDepth: Integer value, where zero means not to follow any soft links
403
404 @param dereference: Indicates whether soft links, if followed, should be dereferenced
405 @type dereference: Boolean value
406
407 @return: Number of items recursively added to the list
408
409 @raise ValueError: If path is not a directory or does not exist.
410 @raise ValueError: If the path could not be encoded properly.
411 """
412 path = encodePath(path)
413 path = normalizeDir(path)
414 return self._addDirContentsInternal(path, addSelf, recursive, linkDepth, dereference)
415
416 - def _addDirContentsInternal(self, path, includePath=True, recursive=True, linkDepth=0, dereference=False):
417 """
418 Internal implementation of C{addDirContents}.
419
420 This internal implementation exists due to some refactoring. Basically,
421 some subclasses have a need to add the contents of a directory, but not
422 the directory itself. This is different than the standard C{FilesystemList}
423 behavior and actually ends up making a special case out of the first
424 call in the recursive chain. Since I don't want to expose the modified
425 interface, C{addDirContents} ends up being wholly implemented in terms
426 of this method.
427
428 The linkDepth parameter controls whether soft links are followed when we
429 are adding the contents recursively. Any recursive calls reduce the
430 value by one. If the value zero or less, then soft links will just be
431 added as directories, but will not be followed. This means that links
432 are followed to a I{constant depth} starting from the top-most directory.
433
434 There is one difference between soft links and directories: soft links
435 that are added recursively are not placed into the list explicitly. This
436 is because if we do add the links recursively, the resulting tar file
437 gets a little confused (it has a link and a directory with the same
438 name).
439
440 @note: If you call this method I{on a link to a directory} that link will
441 never be dereferenced (it may, however, be followed).
442
443 @param path: Directory path whose contents should be added to the list.
444 @param includePath: Indicates whether to include the path as well as contents.
445 @param recursive: Indicates whether directory contents should be added recursively.
446 @param linkDepth: Depth of soft links that should be followed
447 @param dereference: Indicates whether soft links, if followed, should be dereferenced
448
449 @return: Number of items recursively added to the list
450
451 @raise ValueError: If path is not a directory or does not exist.
452 """
453 added = 0
454 if not os.path.exists(path) or not os.path.isdir(path):
455 logger.debug("Path [%s] is not a directory or does not exist on disk.", path)
456 raise ValueError("Path is not a directory or does not exist on disk.")
457 if path in self.excludePaths:
458 logger.debug("Path [%s] is excluded based on excludePaths.", path)
459 return added
460 for pattern in self.excludePatterns:
461 pattern = encodePath(pattern)
462 if re.compile(r"^%s$" % pattern).match(path):
463 logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
464 return added
465 for pattern in self.excludeBasenamePatterns:
466 pattern = encodePath(pattern)
467 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
468 logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
469 return added
470 if self.ignoreFile is not None and os.path.exists(os.path.join(path, self.ignoreFile)):
471 logger.debug("Path [%s] is excluded based on ignore file.", path)
472 return added
473 if includePath:
474 added += self.addDir(path)
475 for entry in os.listdir(path):
476 entrypath = os.path.join(path, entry)
477 if os.path.isfile(entrypath):
478 if linkDepth > 0 and dereference:
479 derefpath = dereferenceLink(entrypath)
480 if derefpath != entrypath:
481 added += self.addFile(derefpath)
482 added += self.addFile(entrypath)
483 elif os.path.isdir(entrypath):
484 if os.path.islink(entrypath):
485 if recursive:
486 if linkDepth > 0:
487 newDepth = linkDepth - 1
488 if dereference:
489 derefpath = dereferenceLink(entrypath)
490 if derefpath != entrypath:
491 added += self._addDirContentsInternal(derefpath, True, recursive, newDepth, dereference)
492 added += self.addDir(entrypath)
493 else:
494 added += self._addDirContentsInternal(entrypath, False, recursive, newDepth, dereference)
495 else:
496 added += self.addDir(entrypath)
497 else:
498 added += self.addDir(entrypath)
499 else:
500 if recursive:
501 newDepth = linkDepth - 1
502 added += self._addDirContentsInternal(entrypath, True, recursive, newDepth, dereference)
503 else:
504 added += self.addDir(entrypath)
505 return added
506
507
508
509
510
511
513 """
514 Removes file entries from the list.
515
516 If C{pattern} is not passed in or is C{None}, then all file entries will
517 be removed from the list. Otherwise, only those file entries matching
518 the pattern will be removed. Any entry which does not exist on disk
519 will be ignored (use L{removeInvalid} to purge those entries).
520
521 This method might be fairly slow for large lists, since it must check the
522 type of each item in the list. If you know ahead of time that you want
523 to exclude all files, then you will be better off setting L{excludeFiles}
524 to C{True} before adding items to the list.
525
526 @param pattern: Regular expression pattern representing entries to remove
527
528 @return: Number of entries removed
529 @raise ValueError: If the passed-in pattern is not a valid regular expression.
530 """
531 removed = 0
532 if pattern is None:
533 for entry in self[:]:
534 if os.path.exists(entry) and os.path.isfile(entry):
535 self.remove(entry)
536 logger.debug("Removed path [%s] from list.", entry)
537 removed += 1
538 else:
539 try:
540 pattern = encodePath(pattern)
541 compiled = re.compile(pattern)
542 except re.error:
543 raise ValueError("Pattern is not a valid regular expression.")
544 for entry in self[:]:
545 if os.path.exists(entry) and os.path.isfile(entry):
546 if compiled.match(entry):
547 self.remove(entry)
548 logger.debug("Removed path [%s] from list.", entry)
549 removed += 1
550 logger.debug("Removed a total of %d entries.", removed)
551 return removed
552
554 """
555 Removes directory entries from the list.
556
557 If C{pattern} is not passed in or is C{None}, then all directory entries
558 will be removed from the list. Otherwise, only those directory entries
559 matching the pattern will be removed. Any entry which does not exist on
560 disk will be ignored (use L{removeInvalid} to purge those entries).
561
562 This method might be fairly slow for large lists, since it must check the
563 type of each item in the list. If you know ahead of time that you want
564 to exclude all directories, then you will be better off setting
565 L{excludeDirs} to C{True} before adding items to the list (note that this
566 will not prevent you from recursively adding the I{contents} of
567 directories).
568
569 @param pattern: Regular expression pattern representing entries to remove
570
571 @return: Number of entries removed
572 @raise ValueError: If the passed-in pattern is not a valid regular expression.
573 """
574 removed = 0
575 if pattern is None:
576 for entry in self[:]:
577 if os.path.exists(entry) and os.path.isdir(entry):
578 self.remove(entry)
579 logger.debug("Removed path [%s] from list.", entry)
580 removed += 1
581 else:
582 try:
583 pattern = encodePath(pattern)
584 compiled = re.compile(pattern)
585 except re.error:
586 raise ValueError("Pattern is not a valid regular expression.")
587 for entry in self[:]:
588 if os.path.exists(entry) and os.path.isdir(entry):
589 if compiled.match(entry):
590 self.remove(entry)
591 logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
592 removed += 1
593 logger.debug("Removed a total of %d entries.", removed)
594 return removed
595
597 """
598 Removes soft link entries from the list.
599
600 If C{pattern} is not passed in or is C{None}, then all soft link entries
601 will be removed from the list. Otherwise, only those soft link entries
602 matching the pattern will be removed. Any entry which does not exist on
603 disk will be ignored (use L{removeInvalid} to purge those entries).
604
605 This method might be fairly slow for large lists, since it must check the
606 type of each item in the list. If you know ahead of time that you want
607 to exclude all soft links, then you will be better off setting
608 L{excludeLinks} to C{True} before adding items to the list.
609
610 @param pattern: Regular expression pattern representing entries to remove
611
612 @return: Number of entries removed
613 @raise ValueError: If the passed-in pattern is not a valid regular expression.
614 """
615 removed = 0
616 if pattern is None:
617 for entry in self[:]:
618 if os.path.exists(entry) and os.path.islink(entry):
619 self.remove(entry)
620 logger.debug("Removed path [%s] from list.", entry)
621 removed += 1
622 else:
623 try:
624 pattern = encodePath(pattern)
625 compiled = re.compile(pattern)
626 except re.error:
627 raise ValueError("Pattern is not a valid regular expression.")
628 for entry in self[:]:
629 if os.path.exists(entry) and os.path.islink(entry):
630 if compiled.match(entry):
631 self.remove(entry)
632 logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
633 removed += 1
634 logger.debug("Removed a total of %d entries.", removed)
635 return removed
636
638 """
639 Removes from the list all entries matching a pattern.
640
641 This method removes from the list all entries which match the passed in
642 C{pattern}. Since there is no need to check the type of each entry, it
643 is faster to call this method than to call the L{removeFiles},
644 L{removeDirs} or L{removeLinks} methods individually. If you know which
645 patterns you will want to remove ahead of time, you may be better off
646 setting L{excludePatterns} or L{excludeBasenamePatterns} before adding
647 items to the list.
648
649 @note: Unlike when using the exclude lists, the pattern here is I{not}
650 bounded at the front and the back of the string. You can use any pattern
651 you want.
652
653 @param pattern: Regular expression pattern representing entries to remove
654
655 @return: Number of entries removed.
656 @raise ValueError: If the passed-in pattern is not a valid regular expression.
657 """
658 try:
659 pattern = encodePath(pattern)
660 compiled = re.compile(pattern)
661 except re.error:
662 raise ValueError("Pattern is not a valid regular expression.")
663 removed = 0
664 for entry in self[:]:
665 if compiled.match(entry):
666 self.remove(entry)
667 logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
668 removed += 1
669 logger.debug("Removed a total of %d entries.", removed)
670 return removed
671
673 """
674 Removes from the list all entries that do not exist on disk.
675
676 This method removes from the list all entries which do not currently
677 exist on disk in some form. No attention is paid to whether the entries
678 are files or directories.
679
680 @return: Number of entries removed.
681 """
682 removed = 0
683 for entry in self[:]:
684 if not os.path.exists(entry):
685 self.remove(entry)
686 logger.debug("Removed path [%s] from list.", entry)
687 removed += 1
688 logger.debug("Removed a total of %d entries.", removed)
689 return removed
690
691
692
693
694
695
697 """Normalizes the list, ensuring that each entry is unique."""
698 orig = len(self)
699 self.sort()
700 dups = list(filter(lambda x, self=self: self[x] == self[x+1], list(range(0, len(self) - 1))))
701 items = list(map(lambda x, self=self: self[x], dups))
702 list(map(self.remove, items))
703 new = len(self)
704 logger.debug("Completed normalizing list; removed %d items (%d originally, %d now).", new-orig, orig, new)
705
707 """
708 Verifies that all entries in the list exist on disk.
709 @return: C{True} if all entries exist, C{False} otherwise.
710 """
711 for entry in self:
712 if not os.path.exists(entry):
713 logger.debug("Path [%s] is invalid; list is not valid.", entry)
714 return False
715 logger.debug("All entries in list are valid.")
716 return True
717
718
719
720
721
722
723 -class SpanItem(object):
724 """
725 Item returned by L{BackupFileList.generateSpan}.
726 """
727 - def __init__(self, fileList, size, capacity, utilization):
728 """
729 Create object.
730 @param fileList: List of files
731 @param size: Size (in bytes) of files
732 @param utilization: Utilization, as a percentage (0-100)
733 """
734 self.fileList = fileList
735 self.size = size
736 self.capacity = capacity
737 self.utilization = utilization
738
745
746
747
748
749
750 """
751 List of files to be backed up.
752
753 A BackupFileList is a L{FilesystemList} containing a list of files to be
754 backed up. It only contains files, not directories (soft links are treated
755 like files). On top of the generic functionality provided by
756 L{FilesystemList}, this class adds functionality to keep a hash (checksum)
757 for each file in the list, and it also provides a method to calculate the
758 total size of the files in the list and a way to export the list into tar
759 form.
760
761 @sort: __init__, addDir, totalSize, generateSizeMap, generateDigestMap,
762 generateFitted, generateTarfile, removeUnchanged
763 """
764
765
766
767
768
772
773
774
775
776
777
779 """
780 Adds a directory to the list.
781
782 Note that this class does not allow directories to be added by themselves
783 (a backup list contains only files). However, since links to directories
784 are technically files, we allow them to be added.
785
786 This method is implemented in terms of the superclass method, with one
787 additional validation: the superclass method is only called if the
788 passed-in path is both a directory and a link. All of the superclass's
789 existing validations and restrictions apply.
790
791 @param path: Directory path to be added to the list
792 @type path: String representing a path on disk
793
794 @return: Number of items added to the list.
795
796 @raise ValueError: If path is not a directory or does not exist.
797 @raise ValueError: If the path could not be encoded properly.
798 """
799 path = encodePath(path)
800 path = normalizeDir(path)
801 if os.path.isdir(path) and not os.path.islink(path):
802 return 0
803 else:
804 return FilesystemList.addDir(self, path)
805
806
807
808
809
810
812 """
813 Returns the total size among all files in the list.
814 Only files are counted.
815 Soft links that point at files are ignored.
816 Entries which do not exist on disk are ignored.
817 @return: Total size, in bytes
818 """
819 total = 0.0
820 for entry in self:
821 if os.path.isfile(entry) and not os.path.islink(entry):
822 total += float(os.stat(entry).st_size)
823 return total
824
826 """
827 Generates a mapping from file to file size in bytes.
828 The mapping does include soft links, which are listed with size zero.
829 Entries which do not exist on disk are ignored.
830 @return: Dictionary mapping file to file size
831 """
832 table = { }
833 for entry in self:
834 if os.path.islink(entry):
835 table[entry] = 0.0
836 elif os.path.isfile(entry):
837 table[entry] = float(os.stat(entry).st_size)
838 return table
839
841 """
842 Generates a mapping from file to file digest.
843
844 Currently, the digest is an SHA hash, which should be pretty secure. In
845 the future, this might be a different kind of hash, but we guarantee that
846 the type of the hash will not change unless the library major version
847 number is bumped.
848
849 Entries which do not exist on disk are ignored.
850
851 Soft links are ignored. We would end up generating a digest for the file
852 that the soft link points at, which doesn't make any sense.
853
854 If C{stripPrefix} is passed in, then that prefix will be stripped from
855 each key when the map is generated. This can be useful in generating two
856 "relative" digest maps to be compared to one another.
857
858 @param stripPrefix: Common prefix to be stripped from paths
859 @type stripPrefix: String with any contents
860
861 @return: Dictionary mapping file to digest value
862 @see: L{removeUnchanged}
863 """
864 table = { }
865 if stripPrefix is not None:
866 for entry in self:
867 if os.path.isfile(entry) and not os.path.islink(entry):
868 table[entry.replace(stripPrefix, "", 1)] = BackupFileList._generateDigest(entry)
869 else:
870 for entry in self:
871 if os.path.isfile(entry) and not os.path.islink(entry):
872 table[entry] = BackupFileList._generateDigest(entry)
873 return table
874
875 @staticmethod
877 """
878 Generates an SHA digest for a given file on disk.
879
880 The original code for this function used this simplistic implementation,
881 which requires reading the entire file into memory at once in order to
882 generate a digest value::
883
884 sha.new(open(path).read()).hexdigest()
885
886 Not surprisingly, this isn't an optimal solution. The U{Simple file
887 hashing <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259109>}
888 Python Cookbook recipe describes how to incrementally generate a hash
889 value by reading in chunks of data rather than reading the file all at
890 once. The recipe relies on the the C{update()} method of the various
891 Python hashing algorithms.
892
893 In my tests using a 110 MB file on CD, the original implementation
894 requires 111 seconds. This implementation requires only 40-45 seconds,
895 which is a pretty substantial speed-up.
896
897 Experience shows that reading in around 4kB (4096 bytes) at a time yields
898 the best performance. Smaller reads are quite a bit slower, and larger
899 reads don't make much of a difference. The 4kB number makes me a little
900 suspicious, and I think it might be related to the size of a filesystem
901 read at the hardware level. However, I've decided to just hardcode 4096
902 until I have evidence that shows it's worthwhile making the read size
903 configurable.
904
905 @param path: Path to generate digest for.
906
907 @return: ASCII-safe SHA digest for the file.
908 @raise OSError: If the file cannot be opened.
909 """
910
911 s = hashlib.sha1()
912 f = open(path, mode="rb")
913 readBytes = 4096
914 while readBytes > 0:
915 readString = f.read(readBytes)
916 s.update(readString)
917 readBytes = len(readString)
918 f.close()
919 digest = s.hexdigest()
920 logger.debug("Generated digest [%s] for file [%s].", digest, path)
921 return digest
922
924 """
925 Generates a list of items that fit in the indicated capacity.
926
927 Sometimes, callers would like to include every item in a list, but are
928 unable to because not all of the items fit in the space available. This
929 method returns a copy of the list, containing only the items that fit in
930 a given capacity. A copy is returned so that we don't lose any
931 information if for some reason the fitted list is unsatisfactory.
932
933 The fitting is done using the functions in the knapsack module. By
934 default, the first fit algorithm is used, but you can also choose
935 from best fit, worst fit and alternate fit.
936
937 @param capacity: Maximum capacity among the files in the new list
938 @type capacity: Integer, in bytes
939
940 @param algorithm: Knapsack (fit) algorithm to use
941 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
942
943 @return: Copy of list with total size no larger than indicated capacity
944 @raise ValueError: If the algorithm is invalid.
945 """
946 table = self._getKnapsackTable()
947 function = BackupFileList._getKnapsackFunction(algorithm)
948 return function(table, capacity)[0]
949
951 """
952 Splits the list of items into sub-lists that fit in a given capacity.
953
954 Sometimes, callers need split to a backup file list into a set of smaller
955 lists. For instance, you could use this to "span" the files across a set
956 of discs.
957
958 The fitting is done using the functions in the knapsack module. By
959 default, the first fit algorithm is used, but you can also choose
960 from best fit, worst fit and alternate fit.
961
962 @note: If any of your items are larger than the capacity, then it won't
963 be possible to find a solution. In this case, a value error will be
964 raised.
965
966 @param capacity: Maximum capacity among the files in the new list
967 @type capacity: Integer, in bytes
968
969 @param algorithm: Knapsack (fit) algorithm to use
970 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
971
972 @return: List of L{SpanItem} objects.
973
974 @raise ValueError: If the algorithm is invalid.
975 @raise ValueError: If it's not possible to fit some items
976 """
977 spanItems = []
978 function = BackupFileList._getKnapsackFunction(algorithm)
979 table = self._getKnapsackTable(capacity)
980 iteration = 0
981 while len(table) > 0:
982 iteration += 1
983 fit = function(table, capacity)
984 if len(fit[0]) == 0:
985
986 raise ValueError("After iteration %d, unable to add any new items." % iteration)
987 removeKeys(table, fit[0])
988 utilization = (float(fit[1])/float(capacity))*100.0
989 item = SpanItem(fit[0], fit[1], capacity, utilization)
990 spanItems.append(item)
991 return spanItems
992
994 """
995 Converts the list into the form needed by the knapsack algorithms.
996 @return: Dictionary mapping file name to tuple of (file path, file size).
997 """
998 table = { }
999 for entry in self:
1000 if os.path.islink(entry):
1001 table[entry] = (entry, 0.0)
1002 elif os.path.isfile(entry):
1003 size = float(os.stat(entry).st_size)
1004 if capacity is not None:
1005 if size > capacity:
1006 raise ValueError("File [%s] cannot fit in capacity %s." % (entry, displayBytes(capacity)))
1007 table[entry] = (entry, size)
1008 return table
1009
1010 @staticmethod
1012 """
1013 Returns a reference to the function associated with an algorithm name.
1014 Algorithm name must be one of "first_fit", "best_fit", "worst_fit", "alternate_fit"
1015 @param algorithm: Name of the algorithm
1016 @return: Reference to knapsack function
1017 @raise ValueError: If the algorithm name is unknown.
1018 """
1019 if algorithm == "first_fit":
1020 return firstFit
1021 elif algorithm == "best_fit":
1022 return bestFit
1023 elif algorithm == "worst_fit":
1024 return worstFit
1025 elif algorithm == "alternate_fit":
1026 return alternateFit
1027 else:
1028 raise ValueError("Algorithm [%s] is invalid." % algorithm)
1029
1031 """
1032 Creates a tar file containing the files in the list.
1033
1034 By default, this method will create uncompressed tar files. If you pass
1035 in mode C{'targz'}, then it will create gzipped tar files, and if you
1036 pass in mode C{'tarbz2'}, then it will create bzipped tar files.
1037
1038 The tar file will be created as a GNU tar archive, which enables extended
1039 file name lengths, etc. Since GNU tar is so prevalent, I've decided that
1040 the extra functionality out-weighs the disadvantage of not being
1041 "standard".
1042
1043 If you pass in C{flat=True}, then a "flat" archive will be created, and
1044 all of the files will be added to the root of the archive. So, the file
1045 C{/tmp/something/whatever.txt} would be added as just C{whatever.txt}.
1046
1047 By default, the whole method call fails if there are problems adding any
1048 of the files to the archive, resulting in an exception. Under these
1049 circumstances, callers are advised that they might want to call
1050 L{removeInvalid()} and then attempt to extract the tar file a second
1051 time, since the most common cause of failures is a missing file (a file
1052 that existed when the list was built, but is gone again by the time the
1053 tar file is built).
1054
1055 If you want to, you can pass in C{ignore=True}, and the method will
1056 ignore errors encountered when adding individual files to the archive
1057 (but not errors opening and closing the archive itself).
1058
1059 We'll always attempt to remove the tarfile from disk if an exception will
1060 be thrown.
1061
1062 @note: No validation is done as to whether the entries in the list are
1063 files, since only files or soft links should be in an object like this.
1064 However, to be safe, everything is explicitly added to the tar archive
1065 non-recursively so it's safe to include soft links to directories.
1066
1067 @note: The Python C{tarfile} module, which is used internally here, is
1068 supposed to deal properly with long filenames and links. In my testing,
1069 I have found that it appears to be able to add long really long filenames
1070 to archives, but doesn't do a good job reading them back out, even out of
1071 an archive it created. Fortunately, all Cedar Backup does is add files
1072 to archives.
1073
1074 @param path: Path of tar file to create on disk
1075 @type path: String representing a path on disk
1076
1077 @param mode: Tar creation mode
1078 @type mode: One of either C{'tar'}, C{'targz'} or C{'tarbz2'}
1079
1080 @param ignore: Indicates whether to ignore certain errors.
1081 @type ignore: Boolean
1082
1083 @param flat: Creates "flat" archive by putting all items in root
1084 @type flat: Boolean
1085
1086 @raise ValueError: If mode is not valid
1087 @raise ValueError: If list is empty
1088 @raise ValueError: If the path could not be encoded properly.
1089 @raise TarError: If there is a problem creating the tar file
1090 """
1091
1092 path = encodePath(path)
1093 if len(self) == 0: raise ValueError("Empty list cannot be used to generate tarfile.")
1094 if mode == 'tar': tarmode = "w:"
1095 elif mode == 'targz': tarmode = "w:gz"
1096 elif mode == 'tarbz2': tarmode = "w:bz2"
1097 else: raise ValueError("Mode [%s] is not valid." % mode)
1098 try:
1099 tar = tarfile.open(path, tarmode)
1100 try:
1101 tar.format = tarfile.GNU_FORMAT
1102 except AttributeError:
1103 tar.posix = False
1104 for entry in self:
1105 try:
1106 if flat:
1107 tar.add(entry, arcname=os.path.basename(entry), recursive=False)
1108 else:
1109 tar.add(entry, recursive=False)
1110 except tarfile.TarError as e:
1111 if not ignore:
1112 raise e
1113 logger.info("Unable to add file [%s]; going on anyway.", entry)
1114 except OSError as e:
1115 if not ignore:
1116 raise tarfile.TarError(e)
1117 logger.info("Unable to add file [%s]; going on anyway.", entry)
1118 tar.close()
1119 except tarfile.ReadError as e:
1120 try: tar.close()
1121 except: pass
1122 if os.path.exists(path):
1123 try: os.remove(path)
1124 except: pass
1125 raise tarfile.ReadError("Unable to open [%s]; maybe directory doesn't exist?" % path)
1126 except tarfile.TarError as e:
1127 try: tar.close()
1128 except: pass
1129 if os.path.exists(path):
1130 try: os.remove(path)
1131 except: pass
1132 raise e
1133
1135 """
1136 Removes unchanged entries from the list.
1137
1138 This method relies on a digest map as returned from L{generateDigestMap}.
1139 For each entry in C{digestMap}, if the entry also exists in the current
1140 list I{and} the entry in the current list has the same digest value as in
1141 the map, the entry in the current list will be removed.
1142
1143 This method offers a convenient way for callers to filter unneeded
1144 entries from a list. The idea is that a caller will capture a digest map
1145 from C{generateDigestMap} at some point in time (perhaps the beginning of
1146 the week), and will save off that map using C{pickle} or some other
1147 method. Then, the caller could use this method sometime in the future to
1148 filter out any unchanged files based on the saved-off map.
1149
1150 If C{captureDigest} is passed-in as C{True}, then digest information will
1151 be captured for the entire list before the removal step occurs using the
1152 same rules as in L{generateDigestMap}. The check will involve a lookup
1153 into the complete digest map.
1154
1155 If C{captureDigest} is passed in as C{False}, we will only generate a
1156 digest value for files we actually need to check, and we'll ignore any
1157 entry in the list which isn't a file that currently exists on disk.
1158
1159 The return value varies depending on C{captureDigest}, as well. To
1160 preserve backwards compatibility, if C{captureDigest} is C{False}, then
1161 we'll just return a single value representing the number of entries
1162 removed. Otherwise, we'll return a tuple of C{(entries removed, digest
1163 map)}. The returned digest map will be in exactly the form returned by
1164 L{generateDigestMap}.
1165
1166 @note: For performance reasons, this method actually ends up rebuilding
1167 the list from scratch. First, we build a temporary dictionary containing
1168 all of the items from the original list. Then, we remove items as needed
1169 from the dictionary (which is faster than the equivalent operation on a
1170 list). Finally, we replace the contents of the current list based on the
1171 keys left in the dictionary. This should be transparent to the caller.
1172
1173 @param digestMap: Dictionary mapping file name to digest value.
1174 @type digestMap: Map as returned from L{generateDigestMap}.
1175
1176 @param captureDigest: Indicates that digest information should be captured.
1177 @type captureDigest: Boolean
1178
1179 @return: Results as discussed above (format varies based on arguments)
1180 """
1181 if captureDigest:
1182 removed = 0
1183 table = {}
1184 captured = {}
1185 for entry in self:
1186 if os.path.isfile(entry) and not os.path.islink(entry):
1187 table[entry] = BackupFileList._generateDigest(entry)
1188 captured[entry] = table[entry]
1189 else:
1190 table[entry] = None
1191 for entry in list(digestMap.keys()):
1192 if entry in table:
1193 if table[entry] is not None:
1194 digest = table[entry]
1195 if digest == digestMap[entry]:
1196 removed += 1
1197 del table[entry]
1198 logger.debug("Discarded unchanged file [%s].", entry)
1199 self[:] = list(table.keys())
1200 return (removed, captured)
1201 else:
1202 removed = 0
1203 table = {}
1204 for entry in self:
1205 table[entry] = None
1206 for entry in list(digestMap.keys()):
1207 if entry in table:
1208 if os.path.isfile(entry) and not os.path.islink(entry):
1209 digest = BackupFileList._generateDigest(entry)
1210 if digest == digestMap[entry]:
1211 removed += 1
1212 del table[entry]
1213 logger.debug("Discarded unchanged file [%s].", entry)
1214 self[:] = list(table.keys())
1215 return removed
1216
1223
1224
1225
1226
1227
1228 """
1229 List of files and directories to be purged.
1230
1231 A PurgeItemList is a L{FilesystemList} containing a list of files and
1232 directories to be purged. On top of the generic functionality provided by
1233 L{FilesystemList}, this class adds functionality to remove items that are
1234 too young to be purged, and to actually remove each item in the list from
1235 the filesystem.
1236
1237 The other main difference is that when you add a directory's contents to a
1238 purge item list, the directory itself is not added to the list. This way,
1239 if someone asks to purge within in C{/opt/backup/collect}, that directory
1240 doesn't get removed once all of the files within it is gone.
1241 """
1242
1243
1244
1245
1246
1250
1251
1252
1253
1254
1255
1256 - def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
1257 """
1258 Adds the contents of a directory to the list.
1259
1260 The path must exist and must be a directory or a link to a directory.
1261 The contents of the directory (but I{not} the directory path itself) will
1262 be recursively added to the list, subject to any exclusions that are in
1263 place. If you only want the directory and its contents to be added, then
1264 pass in C{recursive=False}.
1265
1266 @note: If a directory's absolute path matches an exclude pattern or path,
1267 or if the directory contains the configured ignore file, then the
1268 directory and all of its contents will be recursively excluded from the
1269 list.
1270
1271 @note: If the passed-in directory happens to be a soft link, it will be
1272 recursed. However, the linkDepth parameter controls whether any soft
1273 links I{within} the directory will be recursed. The link depth is
1274 maximum depth of the tree at which soft links should be followed. So, a
1275 depth of 0 does not follow any soft links, a depth of 1 follows only
1276 links within the passed-in directory, a depth of 2 follows the links at
1277 the next level down, etc.
1278
1279 @note: Any invalid soft links (i.e. soft links that point to
1280 non-existent items) will be silently ignored.
1281
1282 @note: The L{excludeDirs} flag only controls whether any given soft link
1283 path itself is added to the list once it has been discovered. It does
1284 I{not} modify any behavior related to directory recursion.
1285
1286 @note: The L{excludeDirs} flag only controls whether any given directory
1287 path itself is added to the list once it has been discovered. It does
1288 I{not} modify any behavior related to directory recursion.
1289
1290 @note: If you call this method I{on a link to a directory} that link will
1291 never be dereferenced (it may, however, be followed).
1292
1293 @param path: Directory path whose contents should be added to the list
1294 @type path: String representing a path on disk
1295
1296 @param recursive: Indicates whether directory contents should be added recursively.
1297 @type recursive: Boolean value
1298
1299 @param addSelf: Ignored in this subclass.
1300
1301 @param linkDepth: Depth of soft links that should be followed
1302 @type linkDepth: Integer value, where zero means not to follow any soft links
1303
1304 @param dereference: Indicates whether soft links, if followed, should be dereferenced
1305 @type dereference: Boolean value
1306
1307 @return: Number of items recursively added to the list
1308
1309 @raise ValueError: If path is not a directory or does not exist.
1310 @raise ValueError: If the path could not be encoded properly.
1311 """
1312 path = encodePath(path)
1313 path = normalizeDir(path)
1314 return super(PurgeItemList, self)._addDirContentsInternal(path, False, recursive, linkDepth, dereference)
1315
1316
1317
1318
1319
1320
1322 """
1323 Removes from the list files younger than a certain age (in days).
1324
1325 Any file whose "age" in days is less than (C{<}) the value of the
1326 C{daysOld} parameter will be removed from the list so that it will not be
1327 purged later when L{purgeItems} is called. Directories and soft links
1328 will be ignored.
1329
1330 The "age" of a file is the amount of time since the file was last used,
1331 per the most recent of the file's C{st_atime} and C{st_mtime} values.
1332
1333 @note: Some people find the "sense" of this method confusing or
1334 "backwards". Keep in mind that this method is used to remove items
1335 I{from the list}, not from the filesystem! It removes from the list
1336 those items that you would I{not} want to purge because they are too
1337 young. As an example, passing in C{daysOld} of zero (0) would remove
1338 from the list no files, which would result in purging all of the files
1339 later. I would be happy to make a synonym of this method with an
1340 easier-to-understand "sense", if someone can suggest one.
1341
1342 @param daysOld: Minimum age of files that are to be kept in the list.
1343 @type daysOld: Integer value >= 0.
1344
1345 @return: Number of entries removed
1346 """
1347 removed = 0
1348 daysOld = int(daysOld)
1349 if daysOld < 0:
1350 raise ValueError("Days old value must be an integer >= 0.")
1351 for entry in self[:]:
1352 if os.path.isfile(entry) and not os.path.islink(entry):
1353 try:
1354 ageInDays = calculateFileAge(entry)
1355 ageInWholeDays = math.floor(ageInDays)
1356 if ageInWholeDays < daysOld:
1357 removed += 1
1358 self.remove(entry)
1359 except OSError:
1360 pass
1361 return removed
1362
1364 """
1365 Purges all items in the list.
1366
1367 Every item in the list will be purged. Directories in the list will
1368 I{not} be purged recursively, and hence will only be removed if they are
1369 empty. Errors will be ignored.
1370
1371 To faciliate easy removal of directories that will end up being empty,
1372 the delete process happens in two passes: files first (including soft
1373 links), then directories.
1374
1375 @return: Tuple containing count of (files, dirs) removed
1376 """
1377 files = 0
1378 dirs = 0
1379 for entry in self:
1380 if os.path.exists(entry) and (os.path.isfile(entry) or os.path.islink(entry)):
1381 try:
1382 os.remove(entry)
1383 files += 1
1384 logger.debug("Purged file [%s].", entry)
1385 except OSError:
1386 pass
1387 for entry in self:
1388 if os.path.exists(entry) and os.path.isdir(entry) and not os.path.islink(entry):
1389 try:
1390 os.rmdir(entry)
1391 dirs += 1
1392 logger.debug("Purged empty directory [%s].", entry)
1393 except OSError:
1394 pass
1395 return (files, dirs)
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406 -def normalizeDir(path):
1407 """
1408 Normalizes a directory name.
1409
1410 For our purposes, a directory name is normalized by removing the trailing
1411 path separator, if any. This is important because we want directories to
1412 appear within lists in a consistent way, although from the user's
1413 perspective passing in C{/path/to/dir/} and C{/path/to/dir} are equivalent.
1414
1415 @param path: Path to be normalized.
1416 @type path: String representing a path on disk
1417
1418 @return: Normalized path, which should be equivalent to the original.
1419 """
1420 if path != os.sep and path[-1:] == os.sep:
1421 return path[:-1]
1422 return path
1423
1424
1425
1426
1427
1428
1429 -def compareContents(path1, path2, verbose=False):
1430 """
1431 Compares the contents of two directories to see if they are equivalent.
1432
1433 The two directories are recursively compared. First, we check whether they
1434 contain exactly the same set of files. Then, we check to see every given
1435 file has exactly the same contents in both directories.
1436
1437 This is all relatively simple to implement through the magic of
1438 L{BackupFileList.generateDigestMap}, which knows how to strip a path prefix
1439 off the front of each entry in the mapping it generates. This makes our
1440 comparison as simple as creating a list for each path, then generating a
1441 digest map for each path and comparing the two.
1442
1443 If no exception is thrown, the two directories are considered identical.
1444
1445 If the C{verbose} flag is C{True}, then an alternate (but slower) method is
1446 used so that any thrown exception can indicate exactly which file caused the
1447 comparison to fail. The thrown C{ValueError} exception distinguishes
1448 between the directories containing different files, and containing the same
1449 files with differing content.
1450
1451 @note: Symlinks are I{not} followed for the purposes of this comparison.
1452
1453 @param path1: First path to compare.
1454 @type path1: String representing a path on disk
1455
1456 @param path2: First path to compare.
1457 @type path2: String representing a path on disk
1458
1459 @param verbose: Indicates whether a verbose response should be given.
1460 @type verbose: Boolean
1461
1462 @raise ValueError: If a directory doesn't exist or can't be read.
1463 @raise ValueError: If the two directories are not equivalent.
1464 @raise IOError: If there is an unusual problem reading the directories.
1465 """
1466 try:
1467 path1List = BackupFileList()
1468 path1List.addDirContents(path1)
1469 path1Digest = path1List.generateDigestMap(stripPrefix=normalizeDir(path1))
1470 path2List = BackupFileList()
1471 path2List.addDirContents(path2)
1472 path2Digest = path2List.generateDigestMap(stripPrefix=normalizeDir(path2))
1473 compareDigestMaps(path1Digest, path2Digest, verbose)
1474 except IOError as e:
1475 logger.error("I/O error encountered during consistency check.")
1476 raise e
1477
1479 """
1480 Compares two digest maps and throws an exception if they differ.
1481
1482 @param digest1: First digest to compare.
1483 @type digest1: Digest as returned from BackupFileList.generateDigestMap()
1484
1485 @param digest2: Second digest to compare.
1486 @type digest2: Digest as returned from BackupFileList.generateDigestMap()
1487
1488 @param verbose: Indicates whether a verbose response should be given.
1489 @type verbose: Boolean
1490
1491 @raise ValueError: If the two directories are not equivalent.
1492 """
1493 if not verbose:
1494 if digest1 != digest2:
1495 raise ValueError("Consistency check failed.")
1496 else:
1497 list1 = UnorderedList(list(digest1.keys()))
1498 list2 = UnorderedList(list(digest2.keys()))
1499 if list1 != list2:
1500 raise ValueError("Directories contain a different set of files.")
1501 for key in list1:
1502 if digest1[key] != digest2[key]:
1503 raise ValueError("File contents for [%s] vary between directories." % key)
1504