unit stringdiff;

(* String difference routines:

   Barry Brannan, September 1997.
   barrylb@poboxes.com
   http://www.poboxes.com/barrylb

   Bug reports, suggestions welcome. If you make any improvements I would
   appreciate if you sent them to me.

   Some possible improvements...
      * faster speed
      * cleaner code

   The routines:
   -------------

   computeDiff(blocksize, original, second)  -->  diffmap
   extractDiff(original, diffmap)  -->  second
   dumpDifference(diffmap)  -->  string illustrating diffmap

   nb blocksize is stored in diffmap


   What it is useful for:
   ----------------------

   Suppose you are downloading pages of data from an online service at regular
   intervals. Rather than saving whole snapshots of the data each time, you
   save the first page then save "difference maps" for each update. If the
   pages don't change much then you have huge gains in storage efficency.

   How it works:
   -------------

   1. Each string (original, second) is divided into "blocks" of the specified
      blocksize.

   2. Algorithm:

      For each block in second do
      Begin
          Find corresponding block in original.
          If found block in original then
               write block number to diffmap
          Else
               write literal text to diffmap
      End.

   3. When trying to find corresponding block in original, the routines attempt
      to find a consecutive block rather than just the first block to optimize
      storage efficiency. See "Diffmap format", below.

   4. The blocksize would ideally be 1 but that generally is too slow, so a
      blocksize of say 32 byte would be suitable for files under 64K. The
      maximum number of blocks in a file cannot exceed 65535 so you may have
      to increase the block size anyway if the files are too large.


   Diffmap format:
   ---------------

   Starts with one byte magic number then two bytes indicating the blocksize
         then followed by a sequence of codes...

         chr #1  --  source block range indicator, followed by four bytes. First two are
                     start block number, second two are end block number.
         chr #2  --  literal string, followed by two bytes indicating the number of
                     characters, then the characters themselves.
         chr #3  --  single source block, followed by two bytes indicating the source
                     block number.

         Current magic number is '1' (ascii 49)

*)

interface

function IntegerToTwoCharString(n: integer): string;
(* Encode integer into diffmap *)

function TwoCharStringToInteger(s: string): integer;
(* Decode integer from diffmap *)

function computeDifference(block_size: integer; original, second: string): string;
(* Compute difference between original and other and return "difference map" string *)

function extractDifference(original, diffmap: string): string;
(* Extract other string using original string and diffmap *)

function dumpDifference(diffmap: string): string;
(* Illustrates diffmap as text *)

implementation

uses SysUtils;


(* Let's hope this works, I grabbed it from D3 RTL source for D2 people who
   don't have it... *)
function CompareMem(P1, P2: Pointer; Length: Integer): Boolean; assembler;
asm
        PUSH    ESI
        PUSH    EDI
        MOV     ESI,P1
        MOV     EDI,P2
        MOV     EDX,ECX
        XOR     EAX,EAX
        AND     EDX,3
        SHR     ECX,1
        SHR     ECX,1
        REPE    CMPSD
        JNE     @@2
        MOV     ECX,EDX
        REPE    CMPSB
        JNE     @@2
@@1:    INC     EAX
@@2:    POP     EDI
        POP     ESI
end;

function IntegerToTwoCharString(n: integer): string;
begin
     result := chr(n shr 8) + chr(n and $FF);
end;

function TwoCharStringToInteger(s: string): integer;
begin
     result := (ord(s[1]) shl 8) + ord(s[2]);
end;

function computeDifference(block_size: integer; original, second: string): string;
var
   found_i1: integer;
   i1: integer;
   i2: integer;
   block, start_block, end_block: integer;
   foundblock: boolean;
   literal: string;
   block_2: string;
   max_i1: integer;
   poriginal: pchar;
   pblock2: pchar;

   function outputBlock(start_block, end_block: integer): string;
   begin
        if start_block = end_block then
           result := #3 + IntegerToTwoCharString(start_block)
        else
            result := #1 + IntegerToTwoCharString(start_block) + IntegerToTwoCharString(end_block);
   end;

   function outputLiteral(literal: string): string;
   begin
        result := #2 + IntegerToTwoCharString(length(literal)) + literal;
   end;

begin
     result := '1' + IntegerToTwoCharString(block_size);
     literal := '';
     i2 := 1;
     max_i1 := length(original)-block_size+1;
     poriginal := pchar(original) - 1;
     start_block := -1;
     end_block := -1;
     while (i2 <= length(second)) do
     begin
          (* Step 1: search for block in original *)

          i1 := 1;
          foundblock := false;
          block_2 := copy(second, i2, block_size);
          (* Special case at end of second string if blocksize does not fit perfectly *)
          if length(block_2) <> block_size then (* end of second, check if matches end of original *)
          begin
               found_i1 := ((length(original)-1) div block_size) * block_size + 1; (* start of last block *)
               if block_2 = copy(original, found_i1, block_size) then
                  foundblock := true;
          end
          else
          begin
               pblock2 := pchar(block_2);
               while (i1 <= max_i1) do
               begin
                    if compareMem(poriginal + i1, pblock2, block_size) then
                    begin
                         foundblock := true;
                         found_i1 := i1;
                         if (start_block = -1) or (((i1-1) div block_size) = (end_block + 1)) then
                            break; (* found consecutive block *)
                    end;

                    i1 := i1 + block_size;
               end;
          end;

          (* Step 2: process block if found or process literal *)

          if foundblock then  (* Found block, append it to output *)
          begin
              if literal <> '' then (* Previous literal exists, output to diffmap *)
              begin
                 result := result + outputLiteral(literal);
                 literal := '';
              end;

              i2 := i2 + block_size; (* Move to next block in second *)
              block := (found_i1-1) div block_size; (* find block number in original string from index *)
              if start_block = -1 then  (* no block exists ? *)
              begin
                 start_block := block; (* set block *)
                 end_block := block;
              end
              else
              begin
                   if block = end_block + 1 then (* consecutive block number; *)
                      inc(end_block)  (* adjust end block *)
                   else
                   begin  (* not consecutive; output current block *)
                        result := result + outputBlock(start_block, end_block);
                        start_block := block; (* start next block *)
                        end_block := block;
                   end
              end
          end
          else   (* Did not find block, append literal *)
          begin
             if start_block <> -1 then   (* output previous block if exists *)
                result := result + outputBlock(start_block, end_block);

             literal := literal + second[i2];  (* append literal *)

             if length(literal) = 65535 then (* max size of literal, output *)
              begin
                 result := result + outputLiteral(literal);
                 literal := '';
              end;

             start_block := -1; (* no block *)
             inc(i2);  (* move to next char in second string *)
          end
     end;
     (* end of string, output remaining stuff *)
     if (start_block <> -1) then
        result := result + outputBlock(start_block, end_block)
     else
         if literal <> '' then
            result := result + outputLiteral(literal)
end;

function block_start_index(index: integer; blocksize: integer): integer;
begin
     result := index * blocksize + 1;
end;

function block_end_index(index: integer; blocksize: integer): integer;
begin
     result := (index+1) * blocksize
end;

function extractDifference(original, diffmap: string): string;
var
   block_size: integer;
   i: integer;
   literal_length: integer;
   start_block: integer;
   end_block: integer;
begin
     if diffmap[1] <> '1' then
     begin
          result := '';
          exit;
     end;

     block_size := TwoCharStringToInteger(copy(diffmap, 2, 2));
     result := '';
     i:=4;
     while (i<=length(diffmap)) do
     begin
          if diffmap[i] = #1 then
          begin
             start_block := block_start_index(TwoCharStringToInteger(copy(diffmap, i+1, 2)), block_size);
             end_block := block_end_index(TwoCharStringToInteger(copy(diffmap, i+3, 2)), block_size);
             result := result + copy(original, start_block, end_block-start_block+1);
             i:=i+5;
          end
          else
          if diffmap[i] = #2 then
          begin
               literal_length := TwoCharStringToInteger(copy(diffmap, i+1, 2));
               result := result + copy(diffmap, i+3, literal_length);
               i := i + 3 + literal_length;
          end
          else
          if diffmap[i] = #3 then
          begin
             start_block := block_start_index(TwoCharStringToInteger(copy(diffmap, i+1, 2)), block_size);
             end_block := block_end_index(TwoCharStringToInteger(copy(diffmap, i+1, 2)), block_size);
             result := result + copy(original, start_block, end_block - start_block + 1);
             i:=i+3;
          end
     end;
end;

function dumpDifference(diffmap: string): string;
var
   block_size: integer;
   i: integer;
   literal_length: integer;
   start_block: integer;
   end_block: integer;
begin
     if diffmap[1] <> '1' then
     begin
          result := '';
          exit;
     end;

     block_size := TwoCharStringToInteger(copy(diffmap, 2, 2));
     result := 'Block size: ' + inttostr(block_size) + #13#10;
     i:=4;
     while (i<=length(diffmap)) do
     begin
          if diffmap[i] = #1 then
          begin
             start_block := TwoCharStringToInteger(copy(diffmap, i+1, 2));
             end_block := TwoCharStringToInteger(copy(diffmap, i+3, 2));
             result := result + 'Block ' + inttostr(start_block) + '-' + inttostr(end_block) + #13#10;
             i:=i+5;
          end
          else
          if diffmap[i] = #2 then
          begin
             literal_length := TwoCharStringToInteger(copy(diffmap, i+1, 2));
             result := result + '<literal ' + inttostr(literal_length) + ' chars.>' + #13#10;
             i := i + 3 + literal_length;
          end
          else
          if diffmap[i] = #3 then
          begin
             start_block := TwoCharStringToInteger(copy(diffmap, i+1, 2));
             result := result + 'Block ' + inttostr(start_block) + #13#10;
             i:=i+3;
          end
     end;
end;


end.
