#!/usr/bin/perl # # Script to check and verify the content files. # http://256.com/gray/docs/content_based_backup/ # # Copyright 2006 by Gray Watson # # Permission to use, copy, modify, and distribute this software for # any purpose and without fee is hereby granted, provided that the # above copyright notice and this permission notice appear in all # copies, and that the name of Gray Watson not be used in advertising # or publicity pertaining to distribution of the document or software # without specific, written prior permission. # # Gray Watson makes no representations about the suitability of the # software described herein for any purpose. It is provided "as is" # without express or implied warranty. # # The author may be contacted via http://256.com/gray/ # # $Id: check_content.pl,v 1.12 2006/06/08 21:26:53 gray Exp $ # use strict; use Digest::SHA2; use Fcntl ':mode'; use PerlIO::gzip; use File::Copy; use File::Find; use File::stat; use Getopt::Long; use IO::Handle; use DBI; ############################################################################### # # USAGE MESSAGE: sub usage { my ($arg) = @_; print STDERR qq[$0: invalid argument usage: $arg Usage: $0 [-c dir] [-d database] [-D] [-m dir] [-p db-port] [-P password] [-s] [-u username] [-U yes/no] -c dir Content directory. No default. -d database Database to connect to. -D Turn on debug output. -m dir Move deleted files into this directory. -p port Port to connect to the database. -P password Password to use to connect to the database. -s Check signatures. This takes a LONG time. -u username Username to use to connect to the database. -U yes/no Unlink the files directly. Must be -U yes. -m is better. ]; exit 1; } # ############################################################################### # # BACKGROUND: # # This script walks through the directory of files organized by # signature and verifies that they are a member of some backup that # was created by the backup script. If not they are moved into # another hierarchy (see -m) or deleted (see -U). # # If you use the -m option then you will need to ispect and then # remove the subdirectories. -m is recommended so in case some # problem happens you can make sure that a small percentage of your # files are to be unlinked. # # The -s option increases the runtime significantly because the # signature of every file will be calculated. This can slow the # script down by a 10x factor and should probably be done less often. # ############################################################################### # # NORMAL USAGE: # # check_content.pl -c /backup/host/CONTENT -m /backup/host/ORPHANED -s # # This will check the various content files in the # /backup/host/CONTENT directory. For each file found it will look it # up in the SQL database. If it exists then it will check the signature # of the file (-s) to make sure it matches. If the file does not exist # in the database then it will be moved into the ORPHANED directory for # possible future removal. # ############################################################################### # # some constants that can be configured with runtime args # # no default specified so you are forced to enter one my $content_dir; # name of the database that holds our backup information, overriden with -b my $database_name = "backup"; # port that the database is running on, overriden with -p my $database_port = 5433; # username to use to connect to the database, overriden with -u my $database_username = "backup"; # password to use to connect to the database, overriden with -P my $database_password = ""; # postgres connection my $db_conn; my $debug_b = 0; my $unlink_b = 0; my $check_sig_b = 0; my $move_dir; # some stats my $file_found_c = 0; my $file_match_c = 0; my $file_diff_c = 0; my $orphan_file_c = 0; my $orphan_byte_c = 0; my $ok_byte_c = 0; my $invalid_content_c = 0; my $wrong_size_c = 0; # # return a pretty representation of size # sub size_string { my ($size) = @_; if ($size > 1024 * 1024 * 1024) { $size /= 1024 * 1024 * 1024; return sprintf("%.1fg", $size); } elsif ($size > 1024 * 1024) { $size /= 1024 * 1024; return sprintf("%.1fm", $size); } elsif ($size > 1024) { $size /= 1024; return sprintf("%.1fk", $size); } else { return "${size}b"; } } # # write an unknown file into the orphaned hierarchy # sub move_file { my ($file, $sig) = @_; return unless $move_dir; # now try to store it in our content hierarchy die "File checksum in an invalid form\n" unless $sig =~ m/^(..)(..)(.*)$/; my ($lev1, $lev2, $rest) = ($1, $2, $3); my $move_path = "$move_dir/$lev1/$lev2/$rest"; if (not rename($file, $move_path)) { # make any directories we will need if this fails my $path = "$move_dir/$lev1"; if (not mkdir($path)) { die "Could not mkdir '$path': $!\n" unless $!{EEXIST}; } $path .= "/$lev2"; if (not mkdir($path)) { die "Could not mkdir '$path': $!\n" unless $!{EEXIST}; } die "Could not rename '$file' to '$move_path': $!\n" unless rename($file, $move_path); } print " moved to '$move_dir'\n" if $debug_b; } # # process a file by checking it with the database # sub process_file { my ($file, $sb) = @_; if ($file !~ m,/(..)/(..)/((...*)(.gz))?$,) { print "ERROR: Unknown content file form for: $file\n"; $invalid_content_c++; return; } print "Checking file $file\n" if $debug_b; my $sig_path = "$1$2$4"; my $sig_full = "$1$2$3"; # if not in a backup then either unlink or move to deleted hierarchy # read it in (if flags set) and verify its checksum matches # verify that the file is in a backup my $match = $db_conn->selectrow_hashref(qq{ SELECT * FROM files WHERE content='$sig_path' LIMIT 1; }); if (not $match) { my $size_str = size_string($sb->size); $orphan_file_c++; $orphan_byte_c += $sb->size; if ($move_dir) { move_file($file, $sig_full); } elsif ($unlink_b) { unlink($file); print " orphan file unlinked: $size_str\n" if $debug_b; } else { print " not found -- left as orphan: $size_str\n" if $debug_b; } return; } $ok_byte_c += $sb->size; print " file in backup $match->{backup}\n" if $debug_b; $file_found_c++; return unless $check_sig_b; # this should succeed open(my $GZIP, "<:gzip", $file) || die "Could not open gzip file '$file': $!\n"; my $sig = Digest::SHA2::new(256); my $file_size = 0; while (1) { # NOTE: we can not use sysread here because of gzip stream my $read_size = read($GZIP, my $buf, 10240); die unless defined($read_size); last unless $read_size; $file_size += $read_size; $sig->add($buf); } close($GZIP) || die "Problems running gzip on file '$file': $!\n"; my $sig_check = $sig->hexdigest(); # verify the checksums if ($sig_path eq $sig_check) { print " signature path matches contents\n" if $debug_b; $file_match_c++; } else { print "ERROR: file '$file' != content sig $sig_check\n"; $file_diff_c++; } # make sure the file size matches the db if ($file_size != $match->{size}) { print "ERROR: file '$file' size $file_size != db size " . $match->{size} . "\n"; $wrong_size_c++; } } # # for each directory entry, process it # sub found_dir { my (@entries) = @_; my @continue = (); DIRENT: foreach my $dirent (@entries) { next if ($dirent eq '.' || $dirent eq '..'); # make our file my $path = $File::Find::dir . "/$dirent"; # trim // to / $path =~ s,//,/,g; my $sb = lstat($path); if (not $sb) { # This happens when we stat the directory and then a file is # removed. Maybe a mail queue directory or other temporary # space. warn "Could not stat $path: $!\n"; next; } my $mode = $sb->mode; # find the file type if (S_ISREG($mode)) { # for a file process_file($path, $sb); } elsif (S_ISDIR($mode)) { # continue down into this push(@continue, $dirent); } elsif (S_ISLNK($mode)) { print " Skipping symlink $path\n" if $debug_b; } elsif (S_ISBLK($mode), S_ISCHR($mode)) { print " Skipping device $path\n" if $debug_b; } else { print " Skipping unknown file $path\n" if $debug_b; } } return @continue; } ############################################################################### my $usage_b = 0; my $unlink_str; GetOptions("content|c=s" => \$content_dir, "database|d" => \$database_name, "debug|D" => \$debug_b, "move-dir|m=s" => \$move_dir, "port|p=s" => \$database_port, "password|P=s" => \$database_password, "signatures|s" => \$check_sig_b, "username|u=s" => \$database_username, "unlink|U=s" => \$unlink_str, "help|usage" => \$usage_b, ) || usage(); usage() if $usage_b; die "Must specify a content directory (-c)\n" unless $content_dir; die "Content directory '$content_dir' is not a directory\n" unless -d $content_dir; if ($unlink_str) { if ($unlink_str eq "yes") { $unlink_b = 1; } else { die "The -U option must have a 'yes' argument."; } } # connect to the DB if not done already (speedy-cgi) $db_conn = DBI->connect("dbi:Pg:dbname=$database_name;port=$database_port", $database_username, $database_password, { RaiseError => 0, PrintError => 0 }); die "Could not connect to database $database_name at port $database_port\n" unless $db_conn; print "Checking out content directory $content_dir\n"; print "Started at " . scalar(localtime) . "\n"; # process our directories find({ 'no_chdir' => 1, 'preprocess' => \&found_dir, 'wanted' => sub {} }, ( $content_dir )); my $ok_size = size_string($ok_byte_c); my $orphan_size = size_string($orphan_byte_c); print "Finished at " . scalar(localtime) . "\n"; print qq[ Checked Content: Found Files: $file_found_c Found Size: $ok_size ($ok_byte_c) Orphaned Files: $orphan_file_c Orphaned Size: $orphan_size ($orphan_byte_c) File Invalid Content: $invalid_content_c Wrong Size: $wrong_size_c ]; if ($check_sig_b) { print qq[ Matching Sig: $file_match_c Different Sig: $file_diff_c ]; }