#!/usr/bin/perl -w

use integer;

# The following make this program more readable.
# If you replace every '$Yes' in this program with '1', etc, this program
# might run faster. And then again it might not make any difference;
# it depends on whether the compiler treats it as a constant or as a variable.
$Yes = 1;
$No = 0;
$Include = 1;
$Exclude = 0;

# These initializations of data can be skipped, since everything is
# initialized to nothing; but without these initializations there would
# be warnings about uninitialized data if perl -w is used
$CurrentListThisFileName='';
$BackedUpListThisFileName='';
$CurrentListThisDirectory='';
$BackedUpListThisDirectory='';
$ModifiedBackedUpListThisDirectory='';
$FilesToBackUpListThisDirectory='';
$EndOfCurrentList=$No;
$CurrentListLineCount = 0;
$CurrentListLookingForBlankLine = $No;
$CurrentListLookingForDirectoryName = $No;
$EndOfBackedUpList=$No;
$BackedUpListLineCount = 0;
$BackedUpListLookingForBlankLine = $No;
$BackedUpListLookingForDirectoryName = $No;
$NewFilesOnly = $No;
$DebugPerlMatches = $No;
$LimitNumberOfFilesInArchive = $No;
$NumberOfFilesInArchive = 0;
$LimitSizeOfArchive = $No;
$SizeOfArchive = 0;
$StartedReadingBackedUpList = $No;
$StartedReadingCurrentList = $No;
$BigFilesOnly = $No;
$IgnoreDirectories = $No;

# these initializations are needed
$CurrentListLookingForTotalOrFirstFile = $Yes;
$BackedUpListLookingForTotalOrFirstFile = $Yes;

#print("@ARGV\n");

if ( $#ARGV < 7 ) {
  print "not enough command line parameters\n";
  exit 11;
  }

if (!open(FH_DirExcFile,"<$ARGV[0]")) {
  print "unable to open directory exclude file $ARGV[0]\n";
  exit 12;
  }
$DirectoryExcludesCount = 0;
while (<FH_DirExcFile>) {
  next if m/^\$/;  # if the first character is '$', ignore the line
  next if m/^$/;  # if the line is blank, ignore the line
  $DirectoryExcludes[$DirectoryExcludesCount] = $_;
  $DirectoryExcludesCount = $DirectoryExcludesCount + 1;
  }
close(FH_DirExcFile);
#print("number of directory excludes is $DirectoryExcludesCount\n");
#print(@DirectoryExcludes);
chomp(@DirectoryExcludes);

if (!open(FH_IncExcFile,"<$ARGV[1]")) {
  print "unable to open include/exclude file $ARGV[1]\n";
  exit 13;
  }
$IncExcPatternCount = 0;
while (<FH_IncExcFile>) {
  if (substr($_, 0, 4) eq 'inc ') {
    $IncExcPatterns[$IncExcPatternCount] = substr($_, 4);
    $IncludeOrExclude[$IncExcPatternCount] = $Include;
    $IncExcPatternCount = $IncExcPatternCount + 1;
    }
  if (substr($_, 0, 4) eq 'exc ') {
    $IncExcPatterns[$IncExcPatternCount] = substr($_, 4);
    $IncludeOrExclude[$IncExcPatternCount] = $Exclude;
    $IncExcPatternCount = $IncExcPatternCount + 1;
    }
  }
close(FH_IncExcFile);
chomp(@IncExcPatterns);

if (!open(FH_CurrentList,"<$ARGV[2]")) {
  print "unable to open current files list file $ARGV[2]\n";
  exit 14;
  }

if (!open(FH_BackedUpList,"<$ARGV[3]")) {
  print "unable to open backed up files list file $ARGV[3]\n";
  exit 15;
  }

if (!open(FH_ModifiedBackedUpList,">>$ARGV[4]")) {
  print "unable to open modified backed up files list file $ARGV[4]\n";
  exit 16;
  }

if (!open(FH_DeletedFileList,">>$ARGV[5]")) {
  print "unable to open deleted files list file $ARGV[5]\n";
  exit 17;
  }

if (!open(FH_FilesToBackUpList,">>$ARGV[6]")) {
  print "unable to open list of files to back up file $ARGV[6]\n";
  exit 18;
  }

if (!open(FH_FilesToBackUpListLs,">>$ARGV[7]")) {
  print "unable to open list of files to back up file, ls format:\n",
              "$ARGV[7]\n";
  exit 19;
  }

$A = 7;
while ( $A < $#ARGV ) {
  $A = $A + 1;
  #print("current option is $ARGV[$A]\n");
  if ($ARGV[$A] eq '--IgnoreDirectories') {
    $IgnoreDirectories = $Yes;
    next;
    }
  if ($ARGV[$A] eq '--NewFilesOnly') {
    $NewFilesOnly = $Yes;
    next;
    }
  if (substr($ARGV[$A],0,20) eq '--DebugPerlMatching=') {
    $DebugPerlMatches = $Yes;
    #print("debugging of Perl matching is on\n");
    $SA = substr($ARGV[$A],20);
    if (!open(FH_DebugPerlMatches,">>$SA")) {
      $DebugPerlMatches = $No;
      print("debugging of Perl matching is turned off because the file\n",
               "$SA\n","cannot be opened for append.\n");
      }
    next;
    }
  #print substr($ARGV[$A],0,23);print("\n");
  if (substr($ARGV[$A],0,23) eq '--MaxNumFilesInArchive=') {
    $MaximumNumberOfFilesInArchive = substr($ARGV[$A],23);
    #print("max files is $MaximumNumberOfFilesInArchive\n");
    if ( $MaximumNumberOfFilesInArchive != 0 ) {
      $LimitNumberOfFilesInArchive = $Yes;
      }
    next;
    }
  if (substr($ARGV[$A],0,19) eq '--MaxSizeOfArchive=') {
    $MaximumSizeOfArchive = substr($ARGV[$A],19);
    if ( $MaximumSizeOfArchive != 0 ) {
      $LimitSizeOfArchive = $Yes;
      }
    next;
    }
  if (substr($ARGV[$A],0,15) eq '--BigFilesOnly=') {
    $MinumumSizeOfFile = substr($ARGV[$A],15);
    if ( $MinumumSizeOfFile != 0 ) {
      $BigFilesOnly = $Yes;
      }
    #print("minimum size of file is $MinumumSizeOfFile\n");
    next;
    }
  print("CompareFileLists does not understand the following command line option:\n",
        "$ARGV[$A]\n");
  exit 9;
  }

#$DebugPerlMatches = $Yes;
#if (!open(FH_DebugPerlMatches,">>dm")) {
  #$DebugPerlMatches = $No;
  #}


&getNextLineFromCurrentList;
&getNextLineFromBackedUpList;

L_CompareNow:
if ( $EndOfCurrentList == $Yes ) {
  if ( $EndOfBackedUpList == $Yes ) {
    # all done
    #print("number of lines read from current list is ",
           #$CurrentListLineCount,"\n");
    #print("number of lines read from backed up list is ",
           #$BackedUpListLineCount,"\n");
    exit 0;
    }
  goto L_DeletedFile;
  }
if ( $EndOfBackedUpList == $Yes ) {
  goto L_NewFile;
  }
# It is not needed to compare the directory names every time. However,
# sometimes it is needed. We could set up some check routines which would
# determine whether or not it is needed, and only compare the directory
# names when we need to. Comparing strings is fairly slow; probably the
# check routines would run faster; maybe the time saved with fewer compares
# would be more than the time spent on extra check routines. Or maybe not. I
# have chosen to compare the directories every time, because the code is
# simpler this way; it is easier to write and there are fewer places for
# bugs to hide. This also applies to the parts below, where it compares the
# directory names before each write to any of the output files.
$A = &directoryNameCompare($CurrentListThisDirectory,
                           $BackedUpListThisDirectory);
if ( $A == -1 ) {
  # $CurrentListThisDirectory comes before $BackedUpListThisDirectory
  goto L_NewFile;
  }
if ( $A == 1 ) {
  # $BackedUpListThisDirectory comes before $CurrentListThisDirectory
  goto L_DeletedFile;
  }
# If we got to here, the directory names must match
if ( $CurrentListThisLine eq $BackedUpListThisLine ) {
  goto L_UnchangedFile;
  }
$A = $CurrentListThisFileName cmp $BackedUpListThisFileName;
if ( $A == 0 ) {
  goto L_ChangedFile;
  }
if ( $A == -1 ) {
  # $CurrentListThisFileName comes before $BackedUpListThisFileName
  goto L_NewFile;
  }
if ( $A == 1 ) {
  # $BackedUpListThisFileName comes before $CurrentListThisFileName
  goto L_DeletedFile;
  }
print("impossible error, cmp returned something other than 1, 0, or -1\n");
exit 40;

L_UnchangedFile:
# should we check to see if the file is included? maybe it got
# backed up before, but the user changed the include exclude
# criteria so it is now excluded; in which case we might want
# to treat it as a deleted file, so that it will be excluded
# from any future normal restore. ugh, too much work.
# Besides, if we check every unchanged file, that will slow
# the program down a lot.
if ( $ModifiedBackedUpListThisDirectory ne $BackedUpListThisDirectory ) {
  print(FH_ModifiedBackedUpList "\n",$BackedUpListThisDirectory,":\n");
  $ModifiedBackedUpListThisDirectory = $BackedUpListThisDirectory;
  }
print(FH_ModifiedBackedUpList $BackedUpListThisLine,"\n");
&getNextLineFromCurrentList;
&getNextLineFromBackedUpList;
goto L_CompareNow;

L_ChangedFile:
if ( $NewFilesOnly == $Yes ) {
  goto L_UnchangedFile;
  }
$SA = substr($CurrentListThisLine,0,1);
if ( $SA eq 'd'    or    $SA eq 'p' ) {
  # if it is a directory, and the ls listing has changed, but the
  # permissions, owner, and group have not changed, then treat it as
  # unchanged. A directory is not changed unless the permissions, owner,
  # or group changes. Also do the same for named pipes.
  # This does not check for the spaces which mark the beginning and the end
  # of the uid and gid, so there may be trouble if the uid and gid are not
  # in the expected position. But I think that this is not likely to be a
  # problem, so I have not bothered to fix this potential bug.
  if (
    substr($CurrentListThisLine,0,11) eq substr($BackedUpListThisLine,0,11)
        and
    substr($CurrentListThisLine,15,18) eq substr($BackedUpListThisLine,15,18)
        ) { goto L_UnchangedFile }
  }
elsif ( $SA eq 'b'    or     $SA eq 'c' ) {
  # a device is not changed unless permissions, owner, group, or major or
  # minor numbers have changed.
  #print("is this device really changed?\n");
  #print("'", substr($CurrentListThisLine,0,10), "' equals? '",
        #substr($BackedUpListThisLine,0,10), "'\n'",
        #substr($CurrentListThisLine,15,26), "' equals? '",
        #substr($BackedUpListThisLine,15,26), "'\n");
  if (
    # this does not check for the spaces before and after the uid, gid,
    # major and minor numbers; this is not likely to be a problem, but
    # maybe it should check anyway, just in case we have the really weird
    # situation of a device with 10,000 hard links.
    substr($CurrentListThisLine,0,10) eq substr($BackedUpListThisLine,0,10)
        and
    substr($CurrentListThisLine,15,27) eq substr($BackedUpListThisLine,15,27)
        ) { goto L_UnchangedFile }
  #print("device is changed\n");
  }
else {
  # if it is not a directory, and the only thing which has changed
  # is the number of hard links, treat it as unchanged
  if (
    substr($CurrentListThisLine,0,10) eq substr($BackedUpListThisLine,0,10)
        and
    substr($CurrentListThisLine,15) eq substr($BackedUpListThisLine,15)
        ) { goto L_UnchangedFile }
  }
if ( $CurrentListThisDirectory eq '' ) {
  $FullFileName = $CurrentListThisFileName;
  }
else {
  $FullFileName = $CurrentListThisDirectory."/".$CurrentListThisFileName;
  }
if ( &isFileNameIncluded($FullFileName) == $Exclude ) {
  goto L_UnchangedFile;
  }
if ( $BigFilesOnly == $Yes ) {
  if ( &getFileSizeFromLSline($CurrentListThisLine) <
                     $MinumumSizeOfFile ) {
    goto L_UnchangedFile;
    }
  }
$NumberOfFilesInArchive = $NumberOfFilesInArchive + 1;
if ( $LimitNumberOfFilesInArchive == $Yes ) {
  if ( $NumberOfFilesInArchive > $MaximumNumberOfFilesInArchive ) {
    if ( $EndOfBackedUpList == $No ) {
      print(FH_ModifiedBackedUpList $BackedUpListThisLine,"\n");
      while (<FH_BackedUpList>) {
        print(FH_ModifiedBackedUpList $_);
        }
      }
    exit 1;
    }
  }
if ( $LimitSizeOfArchive == $Yes ) {
  #print("checking size of archive\n");
  $SizeOfArchive = $SizeOfArchive +
                   &getFileSizeFromLSline($CurrentListThisLine);
  if ( $SizeOfArchive > $MaximumSizeOfArchive ) {
    #print("archive is too big\n");
    #print("number of files in archive is $NumberOfFilesInArchive\n");
    if ( $NumberOfFilesInArchive == 1 ) {
      print("Error:\n$CurrentListThisLine\n",
          "in $CurrentListThisDirectory\n",
          "is too large for maximum archive size of $MaximumSizeOfArchive\n");
      exit 10
      }
    if ( $EndOfBackedUpList == $No ) {
      #print("finishing backed up list\n");
      print(FH_ModifiedBackedUpList $BackedUpListThisLine,"\n");
      while (<FH_BackedUpList>) {
        print(FH_ModifiedBackedUpList $_);
        }
      }
    exit 1;
    }
  }
if ( $ModifiedBackedUpListThisDirectory ne $CurrentListThisDirectory ) {
  print(FH_ModifiedBackedUpList "\n",$CurrentListThisDirectory,":\n");
  $ModifiedBackedUpListThisDirectory = $CurrentListThisDirectory;
  }
if ( $FilesToBackUpListThisDirectory ne $CurrentListThisDirectory ) {
  print(FH_FilesToBackUpListLs "\n",$CurrentListThisDirectory,":\n");
  $FilesToBackUpListThisDirectory = $CurrentListThisDirectory;
  }
print(FH_ModifiedBackedUpList $CurrentListThisLine,"\n");
print(FH_FilesToBackUpListLs $CurrentListThisLine,"\n");
print(FH_FilesToBackUpList $FullFileName,"\n");
if ( $BackedUpListThisDirectory eq '' ) {
  print(FH_DeletedFileList $BackedUpListThisFileName,"\n");
  }
else {
  print(FH_DeletedFileList $BackedUpListThisDirectory,
                             "/",$BackedUpListThisFileName,"\n");
  }
&getNextLineFromCurrentList;
&getNextLineFromBackedUpList;
goto L_CompareNow;

L_DeletedFile:
# the file from backed up list is not in the current list;
# it must have been deleted since the last backup
if ( $NewFilesOnly == $Yes ) {
  if ( $ModifiedBackedUpListThisDirectory ne $BackedUpListThisDirectory ) {
    print(FH_ModifiedBackedUpList "\n",$BackedUpListThisDirectory,":\n");
    $ModifiedBackedUpListThisDirectory = $BackedUpListThisDirectory;
    }
  print(FH_ModifiedBackedUpList $BackedUpListThisLine,"\n");
  &getNextLineFromBackedUpList;
  goto L_CompareNow;
  }
if ( $BackedUpListThisDirectory eq '' ) {
  print(FH_DeletedFileList $BackedUpListThisFileName,"\n");
  }
else {
  print(FH_DeletedFileList $BackedUpListThisDirectory,
                             "/",$BackedUpListThisFileName,"\n");
  }
&getNextLineFromBackedUpList;
goto L_CompareNow;

L_NewFile:
# the file from current list is not in the backed up list;
# it must have been created since the last backup
# or else it was excluded from the last backup
if ( $CurrentListThisDirectory eq '' ) {
  $FullFileName = $CurrentListThisFileName;
  }
else {
  $FullFileName = $CurrentListThisDirectory."/".$CurrentListThisFileName;
  }
if ( &isFileNameIncluded($FullFileName) == $Exclude ) {
  &getNextLineFromCurrentList;
  goto L_CompareNow;
  }
if ( $BigFilesOnly == $Yes ) {
  if ( &getFileSizeFromLSline($CurrentListThisLine) <
                     $MinumumSizeOfFile ) {
    &getNextLineFromCurrentList;
    goto L_CompareNow;
    }
  }
$NumberOfFilesInArchive = $NumberOfFilesInArchive + 1;
if ( $LimitNumberOfFilesInArchive == $Yes ) {
  if ( $NumberOfFilesInArchive > $MaximumNumberOfFilesInArchive ) {
    if ( $EndOfBackedUpList == $No ) {
      print(FH_ModifiedBackedUpList $BackedUpListThisLine,"\n");
      while (<FH_BackedUpList>) {
        print(FH_ModifiedBackedUpList $_);
        }
      }
    exit 1;
    }
  }
if ( $LimitSizeOfArchive == $Yes ) {
  $SizeOfArchive = $SizeOfArchive +
                   &getFileSizeFromLSline($CurrentListThisLine);
  if ( $SizeOfArchive > $MaximumSizeOfArchive ) {
    if ( $NumberOfFilesInArchive == 1 ) {
      print("Error:\n$CurrentListThisLine\n",
          "in $CurrentListThisDirectory\n",
          "is too large for maximum archive size of $MaximumSizeOfArchive\n");
      exit 10
      }
    if ( $EndOfBackedUpList == $No ) {
      print(FH_ModifiedBackedUpList $BackedUpListThisLine,"\n");
      while (<FH_BackedUpList>) {
        print(FH_ModifiedBackedUpList $_);
        }
      }
    exit 1;
    }
  }
if ( $ModifiedBackedUpListThisDirectory ne $CurrentListThisDirectory ) {
  print(FH_ModifiedBackedUpList "\n",$CurrentListThisDirectory,":\n");
  $ModifiedBackedUpListThisDirectory = $CurrentListThisDirectory;
  }
if ( $FilesToBackUpListThisDirectory ne $CurrentListThisDirectory ) {
  print(FH_FilesToBackUpListLs "\n",$CurrentListThisDirectory,":\n");
  $FilesToBackUpListThisDirectory = $CurrentListThisDirectory;
  }
print(FH_ModifiedBackedUpList $CurrentListThisLine,"\n");
print(FH_FilesToBackUpListLs $CurrentListThisLine,"\n");
print(FH_FilesToBackUpList $FullFileName,"\n");
&getNextLineFromCurrentList;
goto L_CompareNow;



sub getNextLineFromCurrentList {
  my $SA = '';
  my $A = 0;
  #print("getting next line from current list\n");
  $CurrentListPreviousFileName = $CurrentListThisFileName;
  L_getNextLineFromCurrentListStart:
  if ( eof(FH_CurrentList) ) {
    $EndOfCurrentList = $Yes;
    $CurrentListThisFileName = '';
    $CurrentListPreviousDirectory = $CurrentListThisDirectory;
    $CurrentListThisDirectory = '';
    return();
    }
  $CurrentListLineCount = $CurrentListLineCount +1;
  #print("reading line from current list\n");
  #print("looking for blank line = $CurrentListLookingForBlankLine\n");
  #print("looking for directory name = $CurrentListLookingForDirectoryName\n");
  $CurrentListThisLine = <FH_CurrentList>;
  # The next line removes all newline characters from the end of the string.
  # There should be one newline character at the end of the string.
  # This program might run faster if we leave the newline character on the
  # end of the string, because then it would not have to restore the newline
  # character when it writes the string to the output files. However, there
  # is a small chance that the last line in the file might be missing the
  # newline character, so it seems safer to do it this way. Also, if it did
  # not remove newlines, then the sub for extracting filenames would return
  # file, directory, and device names ending in newline; but link names
  # would not end in newline, unless the sub was changed to add newline to
  # the end of link names. Also it expects current directory lines to end in
  # ':', not ":\n".
  chomp($CurrentListThisLine);
  # some versions of ls will put '.:' on the first line of the list,
  # and some versions of ls do not.
  # If the first line of the list is '.:', throw away the line.
  if ( $StartedReadingCurrentList == $No )  {
    $StartedReadingCurrentList = $Yes;
    if ( $CurrentListThisLine eq '.:' ) {
      #print("throwing away '.:' on first line\n");
      goto L_getNextLineFromCurrentListStart
      }
    }
  if ( $CurrentListThisLine eq '' ) {
    $CurrentListLookingForDirectoryName = $Yes;
    if ( $CurrentListLookingForBlankLine == $Yes ) {
      $CurrentListLookingForBlankLine = $No;
      }
    goto L_getNextLineFromCurrentListStart;
    }
  if ( $CurrentListLookingForBlankLine == $Yes ) {
    goto L_getNextLineFromCurrentListStart;
    }
  if ( $CurrentListLookingForTotalOrFirstFile == $Yes ) {
    if ( substr($CurrentListThisLine,0,6) eq 'total ') {
      goto L_getNextLineFromCurrentListStart;
      }
    }
  if ( $CurrentListLookingForDirectoryName == $Yes ) {
    if ( substr($CurrentListThisLine,-1) eq ':' ) {
      $SA = substr($CurrentListThisLine,0,-1);
      if ( &directoryExclude($SA) == $Exclude ) {
        $CurrentListLookingForBlankLine = $Yes;
        goto L_getNextLineFromCurrentListStart;
        }
      $CurrentListPreviousDirectory = $CurrentListThisDirectory;
      $CurrentListThisDirectory = $SA;
      $A = &directoryNameCompare($CurrentListThisDirectory,
                                 $CurrentListPreviousDirectory);
      if ( $A == 0 ) {
        print("Error in current file list: $ARGV[2]\n",
              "line $CurrentListLineCount.\n",
              "The current directory is '$CurrentListThisDirectory'\n",
              "The previous directory is '$CurrentListPreviousDirectory'\n",
              "Perl thinks these directories are the same\n");
        exit 30;
        }
      if ( $A == -1 ) {
        print("Error in current file list: $ARGV[2]\n",
              "line $CurrentListLineCount. Perl thinks directory\n",
              "'$CurrentListThisDirectory'\n",
              "should come before directory\n",
              "'$CurrentListPreviousDirectory'\n");
        exit 30;
        }
      # if we get to here, the directories must be in the correct order
      $CurrentListLookingForDirectoryName = $No;
      $CurrentListLookingForTotalOrFirstFile = $Yes;
      goto L_getNextLineFromCurrentListStart;
      }
    print("Error in current file list: $ARGV[2]\n",
          "line $CurrentListLineCount.\n",
          "This line should end with a colon, ':', ",
          "because it comes after a blank line:\n",
          "'$CurrentListThisLine'\n");
    exit 30;
    }
  if ( $IgnoreDirectories == $Yes ) {
    if ( substr($CurrentListThisLine,0,1) eq 'd' ) {
      # if the first letter is a 'd', then it is a directory; skip it
      goto L_getNextLineFromCurrentListStart;
      }
    }
  $CurrentListThisFileName = &getFileNameFromLSline($CurrentListThisLine);
  if ( $CurrentListThisFileName eq '') {
    print("Error in current file list: $ARGV[2]\n",
          "line $CurrentListLineCount.\n",
          "This line does not contain a file name, ",
          "and is not blank either:\n",
          "'$CurrentListThisLine'\n");
    exit 30;
    }
  if ( $CurrentListLookingForTotalOrFirstFile == $Yes ) {
    $CurrentListLookingForTotalOrFirstFile = $No;
    # If this is the first file,
    # we do not need to compare it to the previous file
    return();
    }
  $A = $CurrentListThisFileName cmp $CurrentListPreviousFileName;
  if ( $A == 1 ) {
    return();
    }
  if ( $A == -1 ) {
    # $CurrentListThisFileName comes before $CurrentListPreviousFileName
    print("Error in current file list: $ARGV[2]\n",
          "line $CurrentListLineCount.\n",
          "The lines are not in a perl compatible order.\n",
          "Perl thinks '$CurrentListThisFileName'\n",
          "should come before '$CurrentListPreviousFileName'\n",
          "If perl and ls do not agree on the correct order, you may have\n",
          "a locale problem; try doing 'export LC_COLLATE=POSIX' before\n",
          "running paranoid backup\n");
    exit 30;
    }
  if ( $A == 0 ) {
    # file names are the same
    print("Error in current file list: $ARGV[2]\n",
          "line $CurrentListLineCount. The filename from the current line\n",
          "is '$CurrentListThisFileName'. The filename from\n",
          "the previous line is '$CurrentListPreviousFileName'\n",
          "Perl thinks these filenames are the same.\n");
    exit 30;
    }
  print("impossible error, cmp returned something other than 1, 0, or -1\n");
  exit 40;
  }


sub getNextLineFromBackedUpList {
  my $SA = '';
  my $A =0;
  #print("getting next line from backed up list\n");
  $BackedUpListPreviousFileName = $BackedUpListThisFileName;
  L_getNextLineFromBackedUpListStart:
  if ( eof(FH_BackedUpList) ) {
    $EndOfBackedUpList = $Yes;
    $BackedUpListThisFileName = '';
    $BackedUpListPreviousDirectory = $BackedUpListThisDirectory;
    $BackedUpListThisDirectory = '';
    return();
    }
  $BackedUpListLineCount = $BackedUpListLineCount +1;
  $BackedUpListThisLine = <FH_BackedUpList>;
  chomp($BackedUpListThisLine);
  # some versions of ls will put '.:' on the first line of the list,
  # and some versions of ls do not.
  # If the first line of the list is '.:', throw away the line.
  if ( $StartedReadingBackedUpList == $No )  {
    $StartedReadingBackedUpList = $Yes;
    if ( $BackedUpListThisLine eq '.:' ) {
      goto L_getNextLineFromBackedUpListStart
      }
    }
  if ( $BackedUpListThisLine eq '' ) {
    $BackedUpListLookingForDirectoryName = $Yes;
    if ( $BackedUpListLookingForBlankLine == $Yes ) {
      $BackedUpListLookingForBlankLine = $No;
      }
    goto L_getNextLineFromBackedUpListStart;
    }
  if ( $BackedUpListLookingForBlankLine == $Yes ) {
    goto L_getNextLineFromBackedUpListStart;
    }
  if ( $BackedUpListLookingForTotalOrFirstFile == $Yes ) {
    if ( substr($BackedUpListThisLine,0,6) eq 'total ') {
      goto L_getNextLineFromBackedUpListStart;
      }
    }
  if ( $BackedUpListLookingForDirectoryName == $Yes ) {
    if ( substr($BackedUpListThisLine,-1) eq ':' ) {
      $SA = substr($BackedUpListThisLine,0,-1);
      if ( &directoryExclude($SA) == $Exclude ) {
        $BackedUpListLookingForBlankLine = $Yes;
        goto L_getNextLineFromBackedUpListStart;
        }
      $BackedUpListPreviousDirectory = $BackedUpListThisDirectory;
      $BackedUpListThisDirectory = $SA;
      #$A = $BackedUpListThisDirectory cmp $BackedUpListPreviousDirectory;
      $A = &directoryNameCompare($BackedUpListThisDirectory,
                                 $BackedUpListPreviousDirectory);
      if ( $A == 0 ) {
        print("Error in backed up file list $ARGV[3]\n",
              "line $BackedUpListLineCount.\n",
              "The current directory is '$BackedUpListThisDirectory'\n",
              "The previous directory is '$BackedUpListPreviousDirectory'\n",
              "Perl thinks these directories are the same\n");
        exit 31;
        }
      if ( $A == -1 ) {
        print("Error in backed up file list $ARGV[3]\n",
              "line $BackedUpListLineCount. Perl thinks directory\n",
              "'$BackedUpListThisDirectory'\n",
              "should come before directory\n",
              "'$BackedUpListPreviousDirectory'\n");
        #print("\$BackedUpListThisDirectory is '$BackedUpListThisDirectory'\n");
        #print("\$BackedUpListPreviousDirectory is '$BackedUpListPreviousDirectory'\n");
        #print("\$A is '$A'\n");
        exit 31;
        }
      # if we get to here, the directories must be in the correct order
      $BackedUpListLookingForDirectoryName = $No;
      $BackedUpListLookingForTotalOrFirstFile = $Yes;
      goto L_getNextLineFromBackedUpListStart;
      }
    print("Error in backed up file list: $ARGV[3]\n",
          "line $BackedUpListLineCount.\n",
          "This line should end with a colon, ':', ",
          "because it comes after a blank line:\n",
          "'$BackedUpListThisLine'\n");
    exit 31;
    }
  if ( $IgnoreDirectories == $Yes ) {
    if ( substr($BackedUpListThisLine,0,1) eq 'd' ) {
      # if the first letter is a 'd', then it is a directory; skip it
      goto L_getNextLineFromBackedUpListStart;
      }
    }
  $BackedUpListThisFileName = &getFileNameFromLSline($BackedUpListThisLine);
  if ( $BackedUpListThisFileName eq '') {
    print("Error in backed up file list: $ARGV[3]\n",
          "line $BackedUpListLineCount.\n",
          "This line does not contain a file name, ",
          "and is not blank either:\n",
          "'$BackedUpListThisLine'\n");
    exit 31;
    }
  if ( $BackedUpListLookingForTotalOrFirstFile == $Yes ) {
    $BackedUpListLookingForTotalOrFirstFile = $No;
    # If this is the first file,
    # we do not need to compare it to the previous file
    return();
    }
  $A = $BackedUpListThisFileName cmp $BackedUpListPreviousFileName;
  if ( $A == 1 ) {
    return();
    }
  if ( $A == -1 ) {
    # $BackedUpListThisFileName comes before $BackedUpListPreviousFileName
    print("Error in backed up file list: $ARGV[3]\n",
          "line $BackedUpListLineCount.\n",
          "The lines are not in a perl compatible order.\n",
          "Perl thinks '$BackedUpListThisFileName'\n",
          "should come before '$BackedUpListPreviousFileName'\n",
          "If perl and ls do not agree on the correct order, you may have\n",
          "a locale problem; try doing 'export LC_COLLATE=POSIX' before\n",
          "running paranoid backup\n");
    exit 31;
    }
  if ( $A == 0 ) {
    # file names are the same
    print("Error in backed up file list: $ARGV[3]\n",
          "line $BackedUpListLineCount. The filename from the current line\n",
          "is '$BackedUpListThisFileName'. The filename from\n",
          "the previous line is '$BackedUpListPreviousFileName'\n",
          "Perl thinks these filenames are the same.\n");
    exit 31;
    }
  print("impossible error, cmp returned something other than 1, 0, or -1\n");
  exit 40;
  }


sub directoryExclude {
  #print(STDERR "sub directory excludes\n");
  for(@DirectoryExcludes) {
    if ( $_[0] =~ m/$_/ ) {
      if ( $DebugPerlMatches == $Yes ) {
        print(FH_DebugPerlMatches "$_[0] matches $_, directory excluded\n");
        #print("$_[0] matches $_, directory excluded\n");
        }
      return($Exclude);
      }
    elsif ( $DebugPerlMatches == $Yes )
        { print(FH_DebugPerlMatches "$_[0] does not match $_\n") }
    }
  if ( $DebugPerlMatches == $Yes ) {
    print(FH_DebugPerlMatches "no matches for $_[0], directory included\n");
    }
  return($Include);
  }

sub isFileNameIncluded  {
  # start with the last pattern
  $IncExcPatternCount = $#IncludeOrExclude;
  # for each pattern ...
  while ( $IncExcPatternCount >= 0 ) {
    # if filename matches pattern ...
    if ( $_[0] =~ m/$IncExcPatterns[$IncExcPatternCount]/ ) {
      # if we are debugging the matching, output the match as a succes
      if ( $DebugPerlMatches == $Yes ) {
        if ( $IncludeOrExclude[$IncExcPatternCount] == $Include ) {
          print(FH_DebugPerlMatches "$IncExcPatterns[$IncExcPatternCount]",
                                        " matches $_[0], included\n");
          }
        else {
          print(FH_DebugPerlMatches "$IncExcPatterns[$IncExcPatternCount]",
                                        " matches $_[0], excluded\n");
          }
        }
      # return the include or exclude value stored in
      # $IncludeOrExclude[$IncExcPatternCount]
      return($IncludeOrExclude[$IncExcPatternCount]);
      }
    # the pattern did not match the filename    
    # if we are debugging the matching, output the match as a failure
    if ( $DebugPerlMatches == $Yes ) {
      print(FH_DebugPerlMatches "$IncExcPatterns[$IncExcPatternCount]",
                                        "  does not match $_[0]\n");
      }
    # try the previous pattern
    $IncExcPatternCount = $IncExcPatternCount - 1;
    }
  # none of the patterns matched, so exclude the file
  return($Exclude);
  }

sub getFileNameFromLSline  {
  # The obvious way to do this is to count the number of bytes
  # to where the file name usually begins, and extract the string
  # from there to the end of the line. However, names of
  # symbolic links end at ' ->', not at the end of the line. If a file has
  # more than 999 hard links, like a directory with more
  # than 998 subdirectories, then the format of the ls listing
  # changes; extra characters are added for the number of hard links.
  # If the file size is greater than 99999999, then extra characters are
  # added for the file size.
  # We could start at the end of the line, and search backwards for the
  # space which marks the beginning of the file name. But that would
  # cause problems if we had filenames which include spaces.
  # Also, we need to be careful because different versions of ls have
  # slightly different output formats
  # ls lines using redhat 5.0, ls version 3.16, numeric uid
  #-rwxr-xr-x   1 0        0            6746 Fri Apr 13 13:04:10 2001 autoexec.bak
  #crw-rw-rw-   2 root     root      30,   1 Sun Dec 03 17:21:31 2000 X0R
  #-rw-r--r--   1 0        0        180132680 Fri Mar 22 17:38:20 2002 windows98se.tar.gz
  #drwxr-xr-x 1002 9        13          12288 Tue Aug 29 13:28:21 2000 message.id
  # ls lines using redhat 5.0, ls version 3.16, no numeric uid
  #-rwxr-xr-x   1 root     root         1087 Wed Mar 07 14:48:58 2001 dsovrnet.mac
  # ls lines using redhat 7.2, no numeric uid
  #-rw-r--r--    1 root     root     180132680 Sat Feb 16 18:14:49 2002 windows98se.tar.gz
  #-rw-r--r--    1 root     root       170424 Tue Oct 17 06:10:16 2000 winnt3_clipsave.exe

  my $ExtraCharacters = 0; my $SA;
  # Find the space which comes after the number of hard links.
  # the next while block is wrapped in an if block. We could move the line
  # which increments $ExtraCharacters to the end of the while block and
  # throw away the if block; that would made the source simpler. However,
  # this way we skip additions of 14 and $ExtraCharacters when we know
  # $ExtraCharacters is zero; thus this is more complex but faster.
  if ( substr($_[0],14,1) ne ' ' )  { while ()  {
    $ExtraCharacters = $ExtraCharacters + 1;
    $SA = substr($_[0],(14+$ExtraCharacters),1);
    #print('byte ',14+$ExtraCharacters," is '$SA'\n");
    last if $SA eq ' ';
    # If we get to the end of the string without finding a space, that
    # is an error. We return an empty string, and whatever called this
    # should recognize that as an error.
    return('') if $SA eq '';
    }}
    #print("found space after number of hard links in\n'$_[0]'\n");
  #print('$ExtraCharacters is ',"$ExtraCharacters \n");
  # find the space which comes after the file size
  while ()  {
    $SA = substr($_[0],(41+$ExtraCharacters),1);
    #print('byte ',41+$ExtraCharacters," is '$SA'\n");
    last if $SA eq ' ';
    return('') if $SA eq '';
    $ExtraCharacters = $ExtraCharacters + 1;
    }
  #print("found space after file size in\n'$_[0]'\n");
  # find the space which comes after the file time
  # I think that there are unlikely to be any extra characters between
  # the end of the file size and the end of the file time. Thus this next
  # while block could probably be deleted, and that would be faster.
  # But I am checking for extra characters anyway because I am paranoid.
  while ()  {
    $SA = substr($_[0],(66+$ExtraCharacters),1);
    #print('byte ',66+$ExtraCharacters," is '$SA'\n");
    last if $SA eq ' ';
    return('') if $SA eq '';
    $ExtraCharacters = $ExtraCharacters + 1;
    }
  #print("found space after file time in\n'$_[0]'\n");
  my $NumberOfCharactersBeforeFileName = 67 + $ExtraCharacters;
  #print('index is ',(index($_[0],' -> ',$NumberOfCharactersBeforeFileName)),"\n");
  #print('length of filename is ',(index($_[0],' -> ',$NumberOfCharactersBeforeFileName)-$NumberOfCharactersBeforeFileName),"\n");
  # if it is a link, the filename ends with ' ->'
  if (substr($_[0],0,1) eq 'l') {
    $A = index($_[0],' -> ',$NumberOfCharactersBeforeFileName);
    # if the first character is 'l', then it is a link; so we look for
    # ' -> ', which marks the end of the name of the link. What if
    # we can not find ' -> '?
    if ( $A <= 0 ) {
      # Probably that would be an error. But this sub does not know if the
      # error was in the backed up list or the current list. So we should
      # use the file name, line number, and exit code as additional parameters
      # to this sub and have this sub report the error, or else set
      # LinklessLink=$Yes and return nothing and let the caller figure out
      # the difference between a no filename error and a linkless link error.
      #LinklessLink=$Yes;
      #return();
      # On the other hand, maybe if we can not find ' ->' we should extract
      # the name like a normal file and not report any error; that would allow
      # the backup to continue.
      # Note that I have seen linkless links in /proc, and apparently that
      # was not an error, but you should not be backing up /proc anyway;
      # if /proc is included in the listings you should be directory-excluding
      # it, and then it will not matter if it has linkless links.
      print("warning: linkless link:\n$_[0]\n");
      return(substr($_[0],$NumberOfCharactersBeforeFileName));
      }
    return(substr($_[0],$NumberOfCharactersBeforeFileName,
                  $A-$NumberOfCharactersBeforeFileName));
    }
  # for everything else, the filename ends with the end of the line
  return(substr($_[0],$NumberOfCharactersBeforeFileName));
  }

sub directoryNameCompare  {
  my @A = split('/',$_[0]);  # this directory
  my @B = split('/',$_[1]);  # previous directory
  my $A = 0;
  my $B = 0;
  L_DirectoryNameCompareNext:
  if ( $A > $#A ) {
    if ( $A > $#B ) {
      return(0);
      }
    return(-1);
    }
  if ( $A > $#B ) {
    return(1);
    }
  #print("comparing $A[$A] and $B[$A]\n");
  $B = $A[$A] cmp $B[$A];
  # 0 is same, 1 is normal, -1 is reverse
  if ( $B == 0 )  {
    $A = $A + 1;
    goto L_DirectoryNameCompareNext;
    }
  if ( $B == 1 )  {
    return(1);
    }
  if ( $B == -1 )  {
    return(-1);
    }
  }

sub getFileSizeFromLSline  {
  # if the first character is '-' then it is a regular file; return the
  # file size
  #print(substr($_[0],0,1));print("   ");print(substr($_[0],33,8));
  #print("   ");print(substr($_[0],33,8) + 1);print("\n");
  return(substr($_[0],33,8)) if ( substr($_[0],0,1) eq '-' );
  # otherwise it must be a block or character device, pipe, link,
  # or something like that; return 0
  return(0);
  }




__END__

Maybe if a file is excluded, we should add it to the list of changed and
deleted files, even if the excluded file is not changed or deleted. That
way, if the user changed the include/exclude criteria, excluded but formerly
included files would be automatically removed from the control lists. Or we
could make this optional.




This perl script compares two file lists and makes a list of files to be
backed up.

command line parameters should be:
 $ARGV[0]    input, file name, directory exclude patterns
 $ARGV[1]    input, file name, include exclude patterns
 $ARGV[2]    input, file name, current file list
 $ARGV[3]    input, file name, backed up file list
 $ARGV[4]    output, file name, modified backed up file list
 $ARGV[5]    output, file name, list of files to be removed from old
             backups (deleted or changed files)
 $ARGV[6]    output, file name, list of files to be backed up (new or
             changed files)
 $ARGV[7]    output, like $ARGV[6] but in ls long format

additional optional parameters:
       --IgnoreDirectories
       --NewFilesOnly
       --DebugPerlMatching=[file name]

exit codes:
 0    ok, done
 1    max size of archive reached

 9    incorrect command line option
10    archive too big when no files have been selected
11    not enough command line parameters
12    unable to read directory excludes file
13    unable to read include/exclude list filed
14    unable to read current files list file
15    unable to read backed up files list file
16    unable to write to modified backed up files list file
17    unable to write to deleted files list file
18    unable to write to files to back up list file
19    unable to write to files to back up list file, ls format

30    current files list not in expected format
31    backed up files list not in expected format

40    impossible error (cmp returned something other than 1,0, or -1)

code which has been commented out with no space after the '#'
is debugging code, which has been left in place in case it is needed
for debugging any future revisions.

In several places I have used
   if ( substr([something]) eq [something] {
perl matching could be used instead, like
   if ( [something] =~ m/[something]/ ) {
which would be better? I do not know, but matching is an important feature
of perl, so it is possible that the perl compiler optimizes m// better
than it optimizes substr(). On the other hand, I think that substr() is
more like machine code than m//, so optimizing substr() ought to be easier
than optimizing m//, so the compiler ought to optimize substr() at least
as well as m//, if not better.

This program does a lot of error checking. Maybe I am paranoid, but I
think it is better to have too much error checking than too little.
So maybe you could streamline this program by eliminating some of the
error checking. For example, this program checks to see if both file
lists are in alphabetical order. Since ls is supposed to make the file
lists in alphabetical order, you may think this check is not needed.

man perlop says patterns are recompiled every
time they are evaluated, and this is expensive, and this should
be avoided if possible. Therefore, this program would probably run
faster if includes and excludes were inserted directly into the program.
For example, if the include/exclude file goes like this:
   inc .
   exc /~$
   inc ^notes/~$
Then instead we could change sub isFileNameIncluded to:
    return($Include) if $_[0]=~m/^notes\~$/o;
    return($Exclude) if $_[0]=~m/\~$/o;
    return($Include) if $_[0]=~m/./o;
    return($Exclude);
Note the order is reversed; in the file '.' is first, but in the code
'.' is last.
Something similar could be done for the directory excludes.
Maybe the o option to m// is not needed because the patterns contain no
variables.
Note this brief example does not include code for --DebugPerlMatching
Of course, if the includes/excludes are coded into the program, you have
to have a different version of the program for each configuration.
This program could be split into two parts: The first part would read the
include/exclude and directory exclude files, and then add the
corresponding code to the second part. It could also remove code for
unused options. Then the second part would run. Then the second part
would be deleted. The next time you want to run it, the first part would
create the second part again.


The sub getFileSizeFromLSline does not care that the location of the file
size in the ls line will be changed if there are more than 999 hard links,
if the file size is more than 99,999,999 bytes, etc. In these cases, it will
not get the correct file size. I have not bothered to fix this bug because
the sub getFileSizeFromLSline is only used if we use options
--MaxNumFilesInArchive, --MaxSizeOfArchive, or --BigFilesOnly; and after I
created these options, I decided these options were useless and should be
removed; but I was too lazy to remove them or fix the bugs in them.
