#!/usr/bin/perl
#
# Script to check and verify the content files.
# http://256.com/gray/docs/content_based_backup/
#
# Copyright 2006 by Gray Watson
#
# Permission to use, copy, modify, and distribute this software for
# any purpose and without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all
# copies, and that the name of Gray Watson not be used in advertising
# or publicity pertaining to distribution of the document or software
# without specific, written prior permission.
#
# Gray Watson makes no representations about the suitability of the
# software described herein for any purpose.  It is provided "as is"
# without express or implied warranty.
#
# The author may be contacted via http://256.com/gray/
#
# $Id: check_content.pl,v 1.16 2010-06-13 03:04:32 gray Exp $
#

use strict;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Digest::SHA;
use Fcntl ':mode';
use File::Copy;
use File::Find;
use File::stat;
use Getopt::Long;
use IO::Handle;
use DBI;

# name of the database that holds our backup information, overriden with -b
my $database_name = "backup";

# host that the database is running on, overriden with -h
my $database_host = "localhost";

# port that the database is running on, overriden with -p
my $database_port = 5433;

# database type part of the dbi URI
my $database_type = "Pg";

# username to use to connect to the database, overriden with -u
my $database_username = "backup";

# password to use to connect to the database, overriden with -P
my $database_password = "";

###############################################################################
#
# USAGE MESSAGE:
sub usage {
  my ($arg) = @_;
  print STDERR

qq[$0: invalid argument usage: $arg
Usage: $0 [-c dir] [-d database] [-D] [-h host] [-m dir] [-p db-port] [-P password]
    [-s] [-t db-type] [-u username] [-U yes/no]

    -c dir         Content directory.  No default.
    -d database    Database to connect to.
    -D             Turn on debug output.
    -h host        Host that is serving the database.  Default: $database_host
    -m dir         Move deleted files into this directory.
    -p port        Port to connect to the database.  Default: $database_port
    -P password    Password to use to connect to the database.
    -s             Check signatures.  This takes a LONG time.
    -t db-type     Type portion of the database URI.  Default: $database_type
    -u username    Username to use to connect to the database.
    -U yes/no      Unlink the files directly.  Must be -U yes.  -m is better.
];
  exit 1;
}
#
###############################################################################
#
# BACKGROUND:
#
# This script walks through the directory of files organized by
# signature and verifies that they are a member of some backup that
# was created by the backup script.  If not they are moved into
# another hierarchy (see -m) or deleted (see -U).
#
# If you use the -m option then you will need to ispect and then
# remove the subdirectories.  -m is recommended so in case some
# problem happens you can make sure that a small percentage of your
# files are to be unlinked.
#
# The -s option increases the runtime significantly because the
# signature of every file will be calculated.  This can slow the
# script down by a 10x factor and should probably be done less often.
#
###############################################################################
#
# NORMAL USAGE:
#
#    check_content.pl -c /backup/host/CONTENT -m /backup/host/ORPHANED -s
#
# This will check the various content files in the
# /backup/host/CONTENT directory.  For each file found it will look it
# up in the SQL database.  If it exists then it will check the signature
# of the file (-s) to make sure it matches.  If the file does not exist
# in the database then it will be moved into the ORPHANED directory for
# possible future removal.
#
###############################################################################

#
# some constants that can be configured with runtime args
#

# no default specified so you are forced to enter one
my $content_dir;

# postgres connection
my $db_conn;

my $debug_b = 0;
my $unlink_b = 0;
my $check_sig_b = 0;
my $move_dir;

# some stats
my $file_found_c = 0;
my $file_match_c = 0;
my $file_diff_c = 0;
my $orphan_file_c = 0;
my $orphan_byte_c = 0;
my $ok_byte_c = 0;
my $invalid_content_c = 0;
my $wrong_size_c = 0;

#
# return a pretty representation of size 
#
sub size_string
{
  my ($size) = @_;
  
  if ($size > 1024 * 1024 * 1024) {
    $size /= 1024 * 1024 * 1024;
    return sprintf("%.1fg", $size);
  }
  elsif ($size > 1024 * 1024) {
    $size /= 1024 * 1024;
    return sprintf("%.1fm", $size);
  }
  elsif ($size > 1024) {
    $size /= 1024;
    return sprintf("%.1fk", $size);
  }
  else {
    return "${size}b";
  }
}

#
# write an unknown file into the orphaned hierarchy
#
sub move_file
{
  my ($file, $sig) = @_;
  return unless $move_dir;
  
  # now try to store it in our content hierarchy
  die "File checksum in an invalid form\n" unless $sig =~ m/^(..)(..)(.*)$/;
  my ($lev1, $lev2, $rest) = ($1, $2, $3); 
  my $move_path = "$move_dir/$lev1/$lev2/$rest";
  
  if (not rename($file, $move_path)) {
    
    # make any directories we will need if this fails
    my $path = "$move_dir/$lev1";
    if (not mkdir($path)) {
      die "Could not mkdir '$path': $!\n" unless $!{EEXIST};
    }
    $path .= "/$lev2";
    if (not mkdir($path)) {
      die "Could not mkdir '$path': $!\n" unless $!{EEXIST};
    }
    die "Could not rename '$file' to '$move_path': $!\n"
      unless rename($file, $move_path);
  }
  
  print " moved to '$move_dir'\n" if $debug_b;
}

#
# process a file by checking it with the database
#
sub process_file
{
  my ($file, $sb) = @_;
  if ($file !~ m,/(..)/(..)/((...*)(.gz))?$,) {
    print "ERROR: Unknown content file form for: $file\n";
    $invalid_content_c++;
    return;
  }
  print "Checking file $file\n" if $debug_b;
  
  my $sig_path = "$1$2$4";
  my $sig_full = "$1$2$3";
  
  # if not in a backup then either unlink or move to deleted hierarchy
  # read it in (if flags set) and verify its checksum matches
  
  # verify that the file is in a backup
  my $match = $db_conn->selectrow_hashref(qq{
    SELECT * FROM files WHERE content='$sig_path' LIMIT 1;
  });
  
  if (not $match) {
    my $size_str = size_string($sb->size);
    $orphan_file_c++;
    $orphan_byte_c += $sb->size;
    if ($move_dir) {
      move_file($file, $sig_full);
    }
    elsif ($unlink_b) {
      if (unlink($file)) {
	print " orphan file unlinked: $size_str\n" if $debug_b;
      } else {
	warn "Could not unlink $file: $!";
      }
    }
    else {
      print " not found -- left as orphan: $size_str\n" if $debug_b;
    }
    return;
  }
  $ok_byte_c += $sb->size;
  
  print " file in backup $match->{backup}\n" if $debug_b;
  $file_found_c++;
  
  return unless $check_sig_b;
  
  # this should succeed
  my $GZIP = new IO::Uncompress::Gunzip($file)
    || die "Could not open gzip file '$file': $!\n";
  my $sig = Digest::SHA->new(256);
  my $file_size = 0;
  while (1) {
    # NOTE: we can not use sysread here because of gzip stream
    my $read_size = read($GZIP, my $buf, 10240);
    die unless defined($read_size);
    last unless $read_size;
    $file_size += $read_size;
    $sig->add($buf);
  }
  close($GZIP) || die "Problems running gzip on file '$file': $!\n";
  my $sig_check = $sig->hexdigest();
  
  # verify the checksums
  if ($sig_path eq $sig_check) {
    print " signature path matches contents\n" if $debug_b;
    $file_match_c++;
  }
  else {
    print "ERROR: file '$file' != content sig $sig_check\n";
    $file_diff_c++;
  }
  
  # make sure the file size matches the db
  if ($file_size != $match->{size}) {
    print "ERROR: file '$file' size $file_size != db size " . $match->{size}
      . "\n";
    $wrong_size_c++;
  }
}

#
# for each directory entry, process it
#
sub found_dir
{
  my (@entries) = @_;
  my @continue = ();
  
  DIRENT: foreach my $dirent (@entries) {
    next if ($dirent eq '.' || $dirent eq '..');
    
    # make our file
    my $path = $File::Find::dir . "/$dirent";
    # trim // to /
    $path =~ s,//,/,g;
    
    my $sb = lstat($path);
    if (not $sb) {
      # This happens when we stat the directory and then a file is
      # removed.  Maybe a mail queue directory or other temporary
      # space.
      warn "Could not stat $path: $!\n";
      next;
    }
    my $mode = $sb->mode;
    
    # find the file type
    if (S_ISREG($mode)) {
      # for a file
      process_file($path, $sb);
    }
    elsif (S_ISDIR($mode)) {
      # continue down into this
      push(@continue, $dirent);
    }
    elsif (S_ISLNK($mode)) {
      print " Skipping symlink $path\n" if $debug_b;
    }
    elsif (S_ISBLK($mode), S_ISCHR($mode)) {
      print " Skipping device $path\n" if $debug_b;
    }
    else {
      print " Skipping unknown file $path\n" if $debug_b;
    }
  }
  
  return @continue;
}

###############################################################################

my $usage_b = 0;
my $unlink_str;

GetOptions("content|c=s" => \$content_dir,
	   "database|d=s" => \$database_name,
	   "debug|D" => \$debug_b,
	   "host|h=s" => \$database_host,
	   "move-dir|m=s" => \$move_dir,
	   "port|p=s" => \$database_port,
	   "password|P=s" => \$database_password,
	   "signatures|s" => \$check_sig_b,
	   "type|t=s" => \$database_type,
	   "username|u=s" => \$database_username,
	   "unlink|U=s" => \$unlink_str,
	   "help|usage" => \$usage_b,
	   ) || usage();
usage() if $usage_b;
die "Must specify a content directory (-c)\n" unless $content_dir;
die "Content directory '$content_dir' is not a directory\n" unless -d $content_dir;
if ($unlink_str) {
  if ($unlink_str eq "yes") {
    $unlink_b = 1;
  }
  else {
    die "The -U option must have a 'yes' argument.";   
  }
}

print "NOTE: no orphaned files will be moved or unlinked\n" unless ($move_dir || $unlink_b);

# connect to the DB if not done already (speedy-cgi)
$db_conn = DBI->connect("dbi:$database_type:dbname=$database_name;host=$database_host"
			. ";port=$database_port",
			$database_username, $database_password,
			{ RaiseError => 0, PrintError => 0 });
if (not $db_conn) {
    my $errstr = $DBI::errstr;
    die "Could not connect to $database_host:$database_port database $database_name: $errstr";
}

print "Checking out content directory $content_dir\n";
print "Started at " . scalar(localtime) . "\n";
  
# process our directories
find({ 'no_chdir' => 1, 'preprocess' => \&found_dir, 'wanted' => sub {} }, ( $content_dir ));

my $ok_size = size_string($ok_byte_c);
my $orphan_size = size_string($orphan_byte_c);

print "Finished at " . scalar(localtime) . "\n";
print qq[      Checked Content:
         Found Files: $file_found_c
          Found Size: $ok_size ($ok_byte_c)
      Orphaned Files: $orphan_file_c
       Orphaned Size: $orphan_size ($orphan_byte_c)
File Invalid Content: $invalid_content_c
          Wrong Size: $wrong_size_c
];
if ($check_sig_b) {
  print qq[
        Matching Sig: $file_match_c
       Different Sig: $file_diff_c
];
}

