#!/usr/bin/perl
#
# Content Based Backup/Archive Script
# http://256.com/gray/docs/content_based_backup/
#
# Copyright 2006 by Gray Watson
#
# Permission to use, copy, modify, and distribute this software for
# any purpose and without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all
# copies, and that the name of Gray Watson not be used in advertising
# or publicity pertaining to distribution of the document or software
# without specific, written prior permission.
#
# Gray Watson makes no representations about the suitability of the
# software described herein for any purpose. It is provided "as is"
# without express or implied warranty.
#
# The author may be contacted via http://256.com/gray/
#
# $Id: backup.pl,v 1.42 2007/03/19 18:38:39 gray Exp $
#
###############################################################################
# SEE USAGE DOCUMENTATION BELOW
###############################################################################
#
# BACKGROUND:
#
# This is a content-based backup script that uses a SQL database and a
# directory of files to backup systems. The files are managed by
# their digest signature (SHA256) and stored in the content-directory
# using the signature as path. The mechanism means that files with
# the same signature are stored once, conserving space.
#
# The script requires a content directory where the backup file
# hierarchy will be located. You can either change the default in the
# script to match your system or specify the directory using the '-c
# dir' argument. The script also requires a list of directories to
# backup. They can be specified on the command line or in a file with
# '-b file'.
#
# WARNING: The script does _not_ cross device boundaries (unless -x is
# used) so each partition must be specified separately.
#
# WARNING: this script only backs up directories, symbolic links,
# character/block devices, and files. It ignores named pipes and other
# file types.
#
###############################################################################
#
# This script uses a SQL database to record the various file types.
# Directories, nodes, and symlinks are duplicated with appropriate
# modes, uid, and gid. Files are a reference to their digest
# signature which refers to the content directory. The exception is
# 0 length files, which are recorded in the database without a content
# file. This was an optimization because there are many 0 length files
# and managing them all started to get expensive. Hard links are not
# preserved.
#
# The content directory is organized around digest signatures of all
# of the files. The first 2 levels of directory are the 1st 2 bytes
# from the digest signature of the file being backed up. At the 3rd
# level are files with the rest of the signature which are copies of
# the files that were backed up. The file could be there or it could
# be a compressed .gz version of the file.
#
###############################################################################
#
# NORMAL USAGE:
#
#     backup.pl -c /backup/host/CONTENT -l root /
#
# This will backup the / filesystem. It will copy the files from
# this filesystem into the /backup/host/CONTENT directory organized
# by signatures and it will add to the backups SQL database. The
# collection label (-l) is required.
#
# So if your / contained:
#
#     /foo           (file containing 'hello there')
#     /bar -> baz    (bar is a symbolic link to baz)
#     /baz/          (directory)
#     /baz/foo2      (file containing 'wow, lookie')
#     /baz/foo3      (0 length file)
#
# The /backup/host/CONTENT directory would contain the following where the
# hex numbers are parts of the digest signature.
#
#     2d/01/d5d9c24034d54...............   # (/foo)
#     af/44/71c1274fc260a...............   # (/baz/foo2)
#
# The 'files' database table would contain 5 entries for the 5 files in /.
#
###############################################################################
#
# INCREMENTAL VERSUS FULL BACKUPS:
#
# Full backups are the default. Incrementals can be specified with
# the -i option.
# Incrementals check the last backup date/time and
# only try to backup files that are newer. It is a time optimization
# only since the same number of bytes should be backed up, either way.
# The more backups you make, the more different snapshots of files
# (log files for example) you will have. If you remove backup
# directories, you can run through the content directory and remove
# files that are only pointed to by that backup.
#
###############################################################################

use strict;
use warnings;

use Digest::SHA2;
use Fcntl ':mode';
use PerlIO::gzip;
use File::Find;
use File::stat;
use Getopt::Long;
use IO::Handle;
use POSIX qw(strftime);
use Unix::Mknod qw{makedev mknod major minor};
use DBI;

#
# some constants that can be configured with runtime args
#

# hostname of system being backed up, overridden with -m
chomp(my $hostname = `hostname -s`);

# directory of content files, overridden with -c
my $content_dir = "/usr2/backup/$hostname/CONTENT";

# name of the database that holds our backup information, overridden with -d
my $database_name = "backup";

# port that the database is running on, overridden with -p
my $database_port = 5433;

# username to use to connect to the database, overridden with -u
my $database_username = "backup";

# password to use to connect to the database, overridden with -P
my $database_password = "";

###############################################################################

#
# USAGE MESSAGE
#
# Prints the usage text to STDERR and exits with status 1.  If $arg is
# given it is reported as the offending argument first; it is omitted
# for --help and after GetOptions has already reported its own error.
#
sub usage {
    my ($arg) = @_;

    print STDERR "$0: invalid argument usage: $arg\n\n" if defined $arg;
    print STDERR qq[Usage: $0 [-b file] [-c dir] [-d database] [-D] [-e file] [-i] [-I path]
       [-l label] [-m machine] [-n] [-p db-port] [-P password] [-q]
       [-u username] [-w MM/DD/YYYY] [-x] dir1 ...

    -b file         File containing directories to backup.
    -c dir          Content directory.  Default: $content_dir
    -d database     Database to connect to.  Default: $database_name
    -D              Turn on debug output.
    -e file         File containing file/directory regex patterns to exclude.
    -i              Only backup files that were modified since last full backup.
    -I path         Initial part of path if backing up relative directory.
    -l label        Name of collection for backup information.
    -m machine      Name of machine being backed up.  Default: $hostname
    -n              Dry run.  No change to disk or database.
    -p port         Port to connect to the database.  Default: $database_port
    -P password     Password to use to connect to the database.
    -q              Quiet.  No messages printed.
    -u username     Username to connect to the database.
    -w MM/DD/YYYY   If the backup is not for today then set the date
    -x              Cross device boundaries.  Default is not to.
    dir1 ...        List of directories to backup if -b file not specified.
];
    exit 1;
}

# postgres connection
my $db_conn;

my $backup_id    = 0;
my $cross_dev_b  = 0;
my $debug_b      = 0;
my $quiet_b      = 0;
my $no_changes_b = 0;
my @exclude_patterns;
my $since_time;
my $initial_path;

# some stats
my $back_device_c   = 0;
my $back_dir_c      = 0;
my $back_link_c     = 0;
my $file_new_byte_c = 0;
my $file_new_gzip_c = 0;
my $file_new_c      = 0;
my $file_dup_byte_c = 0;
my $file_dup_c      = 0;
my $back_zero_c     = 0;
my $exclude_path_c  = 0;
my $external_dev_c  = 0;

#
# return a pretty representation of size (b, k, m or g suffix)
#
sub size_string {
    my ($size) = @_;

    if ($size > 1024 * 1024 * 1024) {
        return sprintf("%.1fg", $size / (1024 * 1024 * 1024));
    }
    elsif ($size > 1024 * 1024) {
        return sprintf("%.1fm", $size / (1024 * 1024));
    }
    elsif ($size > 1024) {
        return sprintf("%.1fk", $size / 1024);
    }
    else {
        return "${size}b";
    }
}

#
# format an epoch time the way the database columns expect it
#
sub format_time {
    my ($epoch) = @_;
    return strftime("%m/%d/%Y %H:%M:%S", localtime($epoch));
}

#
# return the non-blank, non-comment (#) lines of a file, chomped
#
sub read_list_file {
    my ($path) = @_;

    open(my $fh, '<', $path) || die "Could not open $path: $!\n";
    my @lines;
    while (my $line = <$fh>) {
        chomp $line;
        next if ($line =~ m/^\#/ || $line =~ m/^$/);
        push @lines, $line;
    }
    close($fh);
    return @lines;
}

#
# build the common part of a 'files' table entry from a path and its
# (l)stat information
#
sub build_entry {
    my ($path, $sb) = @_;

    my $entry = {};
    $entry->{path}  = $path;
    $entry->{mode}  = $sb->mode;
    $entry->{owner} = (getpwuid($sb->uid))[0];
    # oh well, just store the uid
    $entry->{owner} = $sb->uid unless $entry->{owner};
    $entry->{group} = (getgrgid($sb->gid))[0];
    # oh well, just store the gid
    $entry->{group}  = $sb->gid unless $entry->{group};
    $entry->{mtime}  = format_time($sb->mtime);
    $entry->{ctime}  = format_time($sb->ctime);
    $entry->{backup} = $backup_id;
    $entry->{size}   = $sb->size;
    return $entry;
}

#
# write an entry into the db
#
# $table is the table name and $entry a hash-ref of column => value.
# Does nothing in dry-run mode.  Dies if the insert fails or does not
# affect exactly one row.
#
sub write_sql {
    my ($table, $entry) = @_;
    return if $no_changes_b;

    # Sort the columns so the same column set always yields the same SQL
    # text, which lets prepare_cached() reuse the statement handle.
    my @columns = sort keys %$entry;
    my $sql = "INSERT INTO $table (\""
        . join('", "', @columns)
        . '") VALUES ('
        . join(', ', ('?') x @columns)
        . ");";
    my $stmt = $db_conn->prepare_cached($sql);
    if (not $stmt) {
        my $errstr = $db_conn->errstr;
        die("problem preparing sql '$sql' for '$table': $errstr");
    }

    # execute returns undef on error and "0E0" when no rows were affected
    my $rows = $stmt->execute(@{$entry}{@columns});
    if (not defined $rows or $rows != 1) {
        my $errstr = $stmt->errstr;
        if ($errstr) {
            die("insert error on '$sql' for '$table': $errstr");
        }
        else {
            die("backup insert for '$table' affected 0 rows");
        }
    }
}

#
# write a new file into the CONTENT hierarchy
#
# Copies $file (gzip compressed) into the content directory under its
# digest signature and updates $entry->{content} and $entry->{size} to
# match what was actually read.  Returns true on success (or in dry-run
# mode) and false if the source file could not be opened, in which case
# nothing was stored.  Dies on read, write or rename problems.
#
sub write_new_file {
    my ($file, $entry, $sb) = @_;
    return 1 if $no_changes_b;

    # Now try to store it in our content hierarchy.  Since the file may
    # have already changed (logs, db files), we need to copy it and
    # recalculate its signature into a temporary file and then rename
    # it into place.

    # copy the file from filesystem into content hierarchy while compressing
    my $FILE;
    if (not open($FILE, '<', $file)) {
        # Race condition here we could have just missed a temp file.
        # Maybe a mail queue directory or other temporary space.
        warn "Could not open $file: $!\n";
        return;
    }

    # copy the file and then rename it to avoid partial files from crashes
    # this should succeed
    my $tmp_path = "$content_dir/tmp_$$.gz";
    open(my $GZIP, ">:gzip", $tmp_path)
        || die "Could not open gzip file '$tmp_path': $!\n";

    my $sig_calc   = Digest::SHA2->new(256);
    my $read_total = 0;
    while (1) {
        my $size = read($FILE, my $buf, 102400);
        die "Could not read from '$file': $!\n" unless defined($size);
        last unless $size;
        $read_total += $size;
        $sig_calc->add($buf);
        # we can not use syswrite here
        print $GZIP $buf
            or die "Could not write to gzip file '$tmp_path': $!\n";
    }
    # buffered write errors (disk full) only show up at close time
    close($GZIP) || die "Could not close gzip file '$tmp_path': $!\n";
    close($FILE);

    my $sig = $sig_calc->hexdigest();
    die "File checksum in an invalid form\n"
        unless $sig =~ m/^(..)(..)(.*)$/;
    my ($lev1, $lev2, $rest) = ($1, $2, $3);
    my $content_path = "$content_dir/$lev1/$lev2/$rest.gz";

    # now rename the file into place
    if (not rename($tmp_path, $content_path)) {
        # make any directories we will need if this fails
        my $path = "$content_dir/$lev1";
        if (not mkdir($path)) {
            die "Could not mkdir '$path': $!\n" unless $!{EEXIST};
        }
        $path .= "/$lev2";
        if (not mkdir($path)) {
            die "Could not mkdir '$path': $!\n" unless $!{EEXIST};
        }
        # try the rename again
        rename($tmp_path, $content_path)
            || die "Could not rename file from '$tmp_path' to '$content_path': $!\n";
    }

    # get the gzipped file size
    $file_new_gzip_c += (stat($content_path))->size;
    $file_new_c++;
    $file_new_byte_c += $read_total;

    # get our mode and tune it for the content file
    my $mode = $sb->mode;
    # turn off execution and special bits
    $mode &= 0666;
    # mode and ownership are applied best-effort; results are not checked
    chmod($mode, $content_path);
    # sync the uid/gid to match the original
    chown($sb->uid, $sb->gid, $content_path);

    if ($debug_b) {
        print " new file with signature $sig ($read_total bytes)\n";
    }
    else {
        print " new $file with signature $sig ($read_total bytes)\n"
            unless $quiet_b;
    }

    $entry->{content} = $sig;
    $entry->{size}    = $read_total;
    return 1;
}

#
# process a file entry by possibly backing it up
#
sub process_file {
    my ($file, $sb, $entry) = @_;

    # incremental backup?
    if ($since_time && $sb->mtime < $since_time) {
        # XXX: so we could do a query for the last backup which matches
        # the path here.  If it matches in the db and its mtime and size
        # is the same then just copy the content signature and write the
        # entry.  If it does not exist then maybe create it as a new file.
        print "Skipping older file $file\n" if $debug_b;
        return;
    }

    print "Backing up file $file\n" if $debug_b;

    # We special case the 0 sized file and only record it in the
    # database.  We do this because the 0 length content file got
    # overloaded since every system filesystem has 1000s of them.
    if ($sb->size == 0) {
        # zero length file
        $entry->{type} = 5;
        # no need to write content since it is always the same
        $back_zero_c++;
        $entry->{path} = $initial_path . $entry->{path} if $initial_path;
        write_sql("files", $entry);
        return;
    }

    # Read in a file and get its hex signature.  So since we have to
    # read it in again to copy it into the backup hierarchy, I wonder if
    # it makes sense to copy it into memory if it is small or something.
    # Probably does not make sense to copy it to the backup partition
    # now.  Waste of writes most likely.
    open(my $FILE, '<', $file) || die "Could not open $file: $!\n";
    my $sig_calc = Digest::SHA2->new(256);
    $sig_calc->addfile($FILE);
    close($FILE);
    my $sig = $sig_calc->hexdigest();

    # Look for a matching content in another backup directory.
    # NOTE: it might be cheaper to use the filesystem to find a match
    my $match = $db_conn->selectrow_hashref(
        q{SELECT * FROM files WHERE content = ? LIMIT 1},
        undef, $sig);

    # now we check to make sure that the file exists
    die "File checksum in an invalid form\n"
        unless $sig =~ m/^(..)(..)(.*)$/;
    my ($lev1, $lev2, $rest) = ($1, $2, $3);
    my $content_path = "$content_dir/$lev1/$lev2/$rest.gz";
    $entry->{content} = $sig;

    # if we had a match and a file exists
    if ($match && -f $content_path) {
        print " content matched from backup $match->{backup}\n" if $debug_b;
        $file_dup_c++;
        $file_dup_byte_c += $sb->size;
        # could open and verify the zlib size header
    }
    else {
        print " ERROR: backup match for sig '$sig' but no file\n" if $match;
        # Write file into content hierarchy.  If the file vanished before
        # it could be copied, nothing was stored, so do not record an
        # entry that would point at missing content.
        return unless write_new_file($file, $entry, $sb);
        # NOTE: entry content and size may have been adjusted by write_new_file
    }

    # non-zero length normal file
    $entry->{type} = 1;
    $entry->{path} = $initial_path . $entry->{path} if $initial_path;
    write_sql("files", $entry);
}

#
# process a directory entry
#
sub process_dir {
    my ($dir, $sb, $entry) = @_;

    print "Backing up dir $dir\n" if $debug_b;

    $entry->{type} = 2;
    $entry->{path} = $initial_path . $entry->{path} if $initial_path;
    write_sql("files", $entry);
    $back_dir_c++;
}

#
# process a symbolic link entry
#
sub process_symlink {
    my ($link, $entry) = @_;

    print "Backing up symlink $link\n" if $debug_b;

    # just get the link destination; test definedness because a link
    # target of "0" is valid but false
    my $target = readlink($link);
    die "Could not readlink $link: $!\n" unless defined $target;

    $entry->{type}     = 3;
    $entry->{linkpath} = $target;
    $entry->{path}     = $initial_path . $entry->{path} if $initial_path;
    write_sql("files", $entry);
    $back_link_c++;
}

#
# process a block/character device entry
#
sub process_device {
    my ($device, $sb, $entry) = @_;

    print "Backing up device $device\n" if $debug_b;

    # device node
    $entry->{type}  = 4;
    $entry->{major} = major($sb->rdev);
    $entry->{minor} = minor($sb->rdev);
    $entry->{path}  = $initial_path . $entry->{path} if $initial_path;
    write_sql("files", $entry);
    $back_device_c++;
}

#
# for each directory entry, process it
#
# File::Find 'preprocess' callback: receives the names in the current
# directory and returns the sub-directories find() should descend into.
#
sub found_dir {
    my (@entries) = @_;
    my @continue = ();

    # get the current dir
    my $dot_dir = $File::Find::dir;
    my $dot_sb = stat($dot_dir) || die "Could not stat $dot_dir: $!\n";

  DIRENT:
    foreach my $dirent (@entries) {
        next if ($dirent eq '.' || $dirent eq '..');

        # make our file
        my $path = $File::Find::dir . "/$dirent";
        # trim // to /
        $path =~ s,//,/,g;

        # any matches to excluded patterns
        foreach my $pat (@exclude_patterns) {
            if ($path =~ $pat) {
                print " skipped excluded path '$path'\n" unless $quiet_b;
                $exclude_path_c++;
                next DIRENT;
            }
        }

        my $sb = lstat($path);
        if (not $sb) {
            # This happens when we stat the directory and then a file is
            # removed.  Maybe a mail queue directory or other temporary
            # space.
            warn "Could not stat $path: $!\n";
            next DIRENT;
        }
        my $mode = $sb->mode;

        my $entry = build_entry($path, $sb);
        # hack to load in previous backups: turn a leading "./" into "/"
        $entry->{path} =~ s,^\./,/,;

        # find the file type
        if (S_ISREG($mode)) {
            # for a file
            process_file($path, $sb, $entry);
        }
        elsif (S_ISDIR($mode)) {
            # make our directory entry
            process_dir($path, $sb, $entry);
            # only go into this directory if cross-mounts allowed or same device
            if ($cross_dev_b || $sb->dev == $dot_sb->dev) {
                push(@continue, $dirent);
            }
            else {
                # already made the directory entry but do not go down into it
                print " Skipping other device $path\n" unless $quiet_b;
                $external_dev_c++;
            }
        }
        elsif (S_ISLNK($mode)) {
            process_symlink($path, $entry);
        }
        elsif (S_ISBLK($mode) || S_ISCHR($mode)) {
            process_device($path, $sb, $entry);
        }
        elsif (S_ISFIFO($mode) || S_ISSOCK($mode)) {
            # these are combined in the stat flags
            print " Skipping fifo or socket typed file: $path\n" unless $quiet_b;
        }
        elsif (eval { S_ISWHT($mode) }) {
            # the whiteout test may be unavailable on platforms that do
            # not define S_IFWHT, hence the eval
            print " Skipping whiteout (??) typed file: $path\n" unless $quiet_b;
        }
        else {
            # NOTE: we ignore name-pipes, sockets, block/char special
            print " Skipping file of unknown type: $path\n" unless $quiet_b;
        }
    }

    return @continue;
}

###############################################################################

my $dir_file;
my $exclude_file;
my $collection_label;
my $machine;
my $usage_b       = 0;
my $incremental_b = 0;
my $when_date;

# allow -in (as -i, -n) and make -i be case sensitive
Getopt::Long::Configure("bundling");
GetOptions(
    "backup|b=s"       => \$dir_file,
    "content|c=s"      => \$content_dir,
    "database|d=s"     => \$database_name,
    "debug|D"          => \$debug_b,
    "exclude|e=s"      => \$exclude_file,
    "incremental|i"    => \$incremental_b,
    "initial-path|I=s" => \$initial_path,
    "label|l=s"        => \$collection_label,
    "machine|m=s"      => \$machine,
    "no-changes|n"     => \$no_changes_b,
    "port|p=s"         => \$database_port,
    "password|P=s"     => \$database_password,
    "quiet|q"          => \$quiet_b,
    "username|u=s"     => \$database_username,
    "when|w=s"         => \$when_date,
    "cross|x"          => \$cross_dev_b,
    "help|usage"       => \$usage_b,
) || usage();

usage() if $usage_b;
die "Must specify a content directory (-c)\n" unless $content_dir;
die "Content directory '$content_dir' is not a directory\n"
    unless -d $content_dir;
die "Must specify a collection label (-l)\n" unless $collection_label;

print "No changes made to disk.\n" if $no_changes_b;
die "Cannot specify --quiet (-q) and --debug (-D)\n"
    if ($debug_b && $quiet_b);

# read directories to backup from a file
if ($dir_file) {
    push @ARGV, read_list_file($dir_file);
}

# read exclude patterns from a file
if ($exclude_file) {
    push @exclude_patterns, map { qr{$_} } read_list_file($exclude_file);
}

# check the arguments before touching the database or its sequence
usage("directories to backup") unless @ARGV;

# connect to the DB if not done already (speedy-cgi)
$db_conn = DBI->connect(
    "dbi:Pg:dbname=$database_name;port=$database_port",
    $database_username,
    $database_password,
    { RaiseError => 0, PrintError => 0, AutoCommit => 1 });
if (not $db_conn) {
    my $errstr = $DBI::errstr;
    die "Unable to connect to database '$database_name': $errstr\n";
}

# if incremental, lookup last backup
if ($incremental_b) {
    # lookup the last full backup
    ($since_time) = $db_conn->selectrow_array(q{
        SELECT EXTRACT(EPOCH FROM created) FROM backups
        WHERE "collection" = ? AND "full" = true
        ORDER BY "created" DESC LIMIT 1
    }, undef, $collection_label);
    die "Could not find full backup matching collection '$collection_label'\n"
        unless $since_time;
}

if ($no_changes_b) {
    # we lookup the current value and add 1 to it but do not change the db
    ($backup_id) = $db_conn->selectrow_array(q{
        SELECT last_value FROM backup_id_seq;
    });
    $backup_id++ if defined $backup_id;
}
else {
    # get the next id from our sequence
    ($backup_id) = $db_conn->selectrow_array(q{
        SELECT nextval('backup_id_seq');
    });
}
if (not defined $backup_id) {
    my $errstr = $db_conn->errstr;
    die "Could not get the next backup-id: $errstr\n";
}

print "Writing to backup id $backup_id at " . scalar(localtime) . "\n"
    unless $quiet_b;

# update our when-date if necessary
$when_date = format_time(time) unless $when_date;
$machine = $hostname unless $machine;

# write the entry about the backup
write_sql("backups", {
    created    => $when_date,
    id         => $backup_id,
    machine    => $machine,
    collection => $collection_label,
    full       => ($incremental_b ? 'false' : 'true'),
});

# set our umask to 0
umask(0);

my $backup_start = time;

# run through each directory argument
foreach my $dir (@ARGV) {
    print "Backing up directory $dir\n" unless $quiet_b;

    # run through each directory and create the initial path components
    if ($dir ne '.') {
        # NOTE(review): for a relative $dir other than '.', the first
        # component gets a leading '/' here, so the lstat below probes an
        # absolute path -- confirm whether relative dirs are meant to be
        # supported by this loop.
        my $run_dir = '';
        foreach my $dirpart (split("/", $dir)) {
            if ($run_dir eq '/') {
                $run_dir .= $dirpart;
            }
            else {
                $run_dir .= "/$dirpart";
            }
            die "Backup dir '$dir' cannot have a symlink component\n"
                if -l $run_dir;
            my $sb = lstat($run_dir) || die "Could not stat $run_dir\n";
            process_dir($run_dir, $sb, build_entry($run_dir, $sb));
        }
    }

    # process our directories
    find({ 'no_chdir'   => 1,
           'preprocess' => \&found_dir,
           'wanted'     => sub {} },
         ( $dir ));
}

# spit out stats
my $file_new_size      = size_string($file_new_byte_c);
my $file_new_gzip_size = size_string($file_new_gzip_c);
my $file_dup_size      = size_string($file_dup_byte_c);
my $time_diff          = time - $backup_start;
# avoid dividing by zero for sub-second runs
$time_diff = 1 unless $time_diff;
my $time_string = sprintf "%02d:%02d:%02d",
    int($time_diff / 3600), int(($time_diff % 3600) / 60), ($time_diff % 60);
my $bytes_sec      = int($file_new_byte_c / $time_diff);
my $bytes_sec_size = size_string($bytes_sec);

if (not $quiet_b) {
    print "Finished backup at " . scalar(localtime) . "\n";
    print qq[Backed up:
  Added Files:     $file_new_c
  Added Size:      $file_new_size ($file_new_byte_c)
  Gzipd Size:      $file_new_gzip_size ($file_new_gzip_c)
  Duplicate Files: $file_dup_c
  Duplicate Size:  $file_dup_size ($file_dup_byte_c)
  Zero Files:      $back_zero_c
  Directories:     $back_dir_c
  Devices:         $back_device_c
  Sym Links:       $back_link_c
  Excluded Paths:  $exclude_path_c
  External Devs:   $external_dev_c
  Bytes/Sec:       $bytes_sec_size ($bytes_sec) in $time_string ($time_diff sec)
];
}

#
# Update the backups db entry with some stats.  Skipped on a dry run:
# no backups row was inserted, and the database must not be changed.
#
if (not $no_changes_b) {
    my %stats = (
        new_files   => $file_new_c,
        new_size    => $file_new_byte_c,
        gzip_size   => $file_new_gzip_c,
        dup_files   => $file_dup_c,
        dup_size    => $file_dup_byte_c,
        directories => $back_dir_c,
        symlinks    => $back_link_c,
        devices     => $back_device_c,
        zero_files  => $back_zero_c,
        duration    => "$time_diff seconds",
    );

    my @stat_columns = sort keys %stats;
    my $sql = "UPDATE backups SET "
        . join(', ', map { "$_ = ?" } @stat_columns)
        . " WHERE \"id\" = ?";
    my $stmt = $db_conn->prepare($sql);
    if (not $stmt) {
        my $errstr = $db_conn->errstr;
        die("problem preparing sql '$sql' for 'backups': $errstr");
    }

    my $rows = $stmt->execute(@stats{@stat_columns}, $backup_id);
    if (not defined $rows or $rows != 1) {
        my $errstr = $stmt->errstr;
        if ($errstr) {
            die "Could not update backups with stats via '$sql': $errstr\n";
        }
        else {
            die "Updating backups with stats affected 0 rows via '$sql'\n";
        }
    }
}