zip-compress-reduce.adb

Back to...
Source file : zip-compress-reduce.adb


--  Legal licensing note:

--  Copyright (c) 2009 .. 2023 Gautier de Montmollin
--  SWITZERLAND

--  Permission is hereby granted, free of charge, to any person obtaining a copy
--  of this software and associated documentation files (the "Software"), to deal
--  in the Software without restriction, including without limitation the rights
--  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
--  copies of the Software, and to permit persons to whom the Software is
--  furnished to do so, subject to the following conditions:

--  The above copyright notice and this permission notice shall be included in
--  all copies or substantial portions of the Software.

--  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
--  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
--  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
--  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
--  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
--  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
--  THE SOFTWARE.

--  NB: this is the MIT License, as found on the site
--  http://www.opensource.org/licenses/mit-license.php
--
-----------------
--  "Reduce" method - probabilistic reduction with a Markov chain.
--  See package specification for details.
--

--  Change log:
--
--  7-Feb-2009: GdM: added a cache for LZ77 output to make 2nd phase faster

with LZ77;

with Ada.Text_IO;

procedure Zip.Compress.Reduce
  (input,
   output           : in out Zip_Streams.Root_Zipstream_Type'Class;
   input_size_known :        Boolean;
   input_size       :        Zip_64_Data_Size_Type;  --  ignored if unknown
   feedback         :        Feedback_Proc;
   method           :        Reduction_Method;
   CRC              : in out Interfaces.Unsigned_32;  --  only updated here
   crypto           : in out CRC_Crypto.Crypto_pack;
   output_size      :    out Zip_64_Data_Size_Type;
   compression_ok   :    out Boolean)  --  indicates when compressed <= uncompressed
is
  use Interfaces;

  reduction_factor : constant Positive :=
    1 + Compression_Method'Pos (method) - Compression_Method'Pos (Reduce_1);
  use Zip_Streams;

  DLE_code : constant := 144;  --  "Escape" character, leading to a Distance - Length code.
  subtype Symbol_range is Integer range 0 .. 255;
  subtype Follower_range is Symbol_range range 0 .. 31;
  --  PKWARE appnote.txt limits to 32 followers.
  --  Above 32, you get "PKUNZIP: (W03) Warning! file has bad table"
  --  Up to 63 is indeed possible and accepted by unzip <=5.12, WinZip
  --  and of course Zip-Ada :-)
  --  Optimum with 63 is extremely rare, the gain on test
  --  files showing a 63 is 0.02%.
  --  Then, we prefer compatibility here.

  Followers : array (Symbol_range, Follower_range) of Symbol_range :=
    (others => (others => 0));
  Slen : array (Symbol_range) of Symbol_range;

  --  Bits taken by (x-1) mod 256:
  B_Table : constant array (Symbol_range) of Integer :=
      (0        => 8,
       1 ..   2 => 1,
       3 ..   4 => 2,
       5 ..   8 => 3,
       9 ..  16 => 4,
      17 ..  32 => 5,
      33 ..  64 => 6,
      65 .. 128 => 7,
     129 .. 255 => 8);

  ------------------
  -- Buffered I/O --
  ------------------

  IO_buffers : IO_Buffers_Type;

  procedure Put_byte (B : Byte) is
  begin
    IO_buffers.OutBuf (IO_buffers.OutBufIdx) := B;
    IO_buffers.OutBufIdx := IO_buffers.OutBufIdx + 1;
    if IO_buffers.OutBufIdx > IO_buffers.OutBuf'Last then
      Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
    end if;
  end Put_byte;

  --------------------------------------------------------------------------

  -----------------
  -- Code buffer --
  -----------------

  Save_byte : Byte;  --  Output code buffer
  Bits_used : Byte;  --  Index into output code buffer

  procedure Flush_output is
  begin
    if Bits_used /= 0 then
      Put_byte (Save_byte);
    end if;
    if IO_buffers.OutBufIdx > 1 then
      Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
    end if;
  end Flush_output;

  procedure Put_code (Code : Byte; Code_size : Natural) is
    Code_work : Byte;
    temp, Save_byte_local, Bits_used_local : Byte;
  begin
    Code_work := Code;
    temp := 0;
    Save_byte_local := Save_byte;
    Bits_used_local := Bits_used;
    for count in reverse 1 .. Code_size loop
      temp := 0;
      if Code_work mod 2 = 1 then
        temp := temp + 1;
      end if;
      Code_work := Code_work  / 2;
      temp := Shift_Left (temp, Integer (Bits_used_local));
      Bits_used_local := Bits_used_local + 1;
      Save_byte_local := Save_byte_local or temp;
      if Bits_used_local = 8 then
        Put_byte (Save_byte_local);
        Save_byte_local := 0;
        temp := 0;
        Bits_used_local := 0;
      end if;
    end loop;
    Save_byte := Save_byte_local;
    Bits_used := Bits_used_local;
  end Put_code;

  procedure Show_symbol (S : Symbol_range) is
  begin
    if S in 32 .. 126 then
      Ada.Text_IO.Put (Character'Val (S));
    else
      Ada.Text_IO.Put ('{' & Symbol_range'Image (S) & '}');
    end if;
  end Show_symbol;

  procedure Save_Followers is
  begin
    for X in reverse Symbol_range loop
      Put_code (Byte (Slen (X)), 6); -- max 2**6 followers per symbol
      for I in 0 .. Slen (X) - 1  loop
        Put_code (Byte (Followers (X, I)), 8);
      end loop;
    end loop;
  end Save_Followers;

  ----------------------------------------
  -- Probabilistic back-end compression --
  ----------------------------------------

  subtype Count is Integer_32;

  markov_d : array (Symbol_range, Symbol_range) of Count :=
    (others => (others => 0));

  --  Build probability of transition from symbol i to symbol j:
  --
  --     markov(i,j) = P(symbol i is followed by j)
  --                 = markov_d(i,j) / sum_k( markov_d(i,k))
  --
  --  Only the discrete matrix markov_d is stored.

  total_row : array (Symbol_range) of Count;
  --  total_row(i) = sum_k( markov_d(i,k))

  order : array (Symbol_range, Symbol_range) of Symbol_range;

  use_probas : constant Boolean := True;
  trace      : constant Boolean := False;

  --  We use the most significant quantiles of each row of the Markov matrix
  --  to allow for a frequent coding that is shorter than the symbol itself.
  --  Otherwise, why doing all that ;-) ?...

  has_follower : array (Symbol_range, Symbol_range) of Boolean :=
    (others => (others => False));
  follower_pos : array (Symbol_range, Symbol_range) of Follower_range;

  --  follower_pos(a,b) only significant if has_follower(a,b) = True

  procedure Show_partial_markov (ordered : Boolean) is
    subtype subrange is
      Symbol_range range Character'Pos ('a') .. Character'Pos ('i');
    sk, min, max : Symbol_range;
  begin
    if ordered then
      min := Follower_range'First;
      max := Follower_range'Last;
    else
      min := subrange'First;
      max := subrange'Last;
    end if;
    Ada.Text_IO.New_Line;
    for si in subrange loop
      Show_symbol (si);
      Ada.Text_IO.Put ("| ");
      for sj in min .. max loop
        if ordered then -- show top probas
          sk := order (si, sj);
        else -- show probas in the same subrange
          sk := sj;
        end if;
        Show_symbol (sk);
        Ada.Text_IO.Put (':' & Count'Image (markov_d (si, sj)) & ' ');
      end loop;
      if ordered then
        Ada.Text_IO.Put ("|" & Count'Image (total_row (si)) & Integer'Image (Slen (si)));
      end if;
      Ada.Text_IO.New_Line;
    end loop;
  end Show_partial_markov;

  procedure Build_Followers is

    procedure Swap (a, b : in out Count) is
      pragma Inline (Swap);
      c : Count;
    begin c := a; a := b; b := c; end Swap;

    procedure Swap (a, b : in out Symbol_range) is
      pragma Inline (Swap);
      c : Symbol_range;
    begin c := a; a := b; b := c; end Swap;

    --  Sort a row (sym_row) in the Markov matrix, largest probabilities first.
    --  Original order is kept by the order matrix.
    --
    procedure Sort (sym_row : Symbol_range) is
      --  A stupid bubble sort algo, but one working with sets
      --  having big packs of same data (trouble with qsort)...
      swapped : Boolean;
      left : Symbol_range := Symbol_range'First;
    begin
      loop
        swapped := False;
        for i in reverse left .. Symbol_range'Last - 1 loop
          if markov_d (sym_row, i) < markov_d (sym_row, i + 1) then
            Swap (markov_d (sym_row, i), markov_d (sym_row, i + 1));
            Swap (order   (sym_row, i), order   (sym_row, i + 1));
            swapped := True;
          end if;
        end loop;
        exit when not swapped;
        left := left + 1;
      end loop;
    end Sort;

    cumul : Count;
    follo : Symbol_range;

    subtype Bit_range is Integer range 0 .. 5;
    --  6 would be possible, PKWARE appnote.txt limits number
    --  of followers to 2**5

    max_follo : constant array (Bit_range) of Integer :=
      --  mostly 2**n - 1
      --  actual follower range is: 0..max_follo(bits)
       (0 => -1, -- NB: not 0; range is empty here
        1 =>  1,
        2 =>  3,
        3 =>  7,
        4 => 15,
        5 => 31
    --  6 => 62  -- NB: not 63
      );

    cumul_per_bit_length : array (Bit_range) of Count := (others => 0);

    subtype Real is Long_Float;

    exp_size, min_size : Real;
    bits_min : Bit_range;

  begin -- Build_Followers
    if not use_probas then
      Slen := (others => 0);
      return;
    end if;
    if trace then
      Show_partial_markov (False);
    end if;
    for si in Symbol_range loop
      total_row (si) := 0;
      for sj in Symbol_range loop
        order (si, sj) := sj;
        total_row (si) := total_row (si) + markov_d (si, sj);
      end loop;
      Sort (si);
      cumul := 0;
      --  Define all possible followers to symbol si:
      for sj in Follower_range loop
        cumul := cumul + markov_d (si, sj);
        cumul_per_bit_length (B_Table (sj + 1)) := cumul;
        --  ^ Overwritten several times. When sj jumps to the next bit length
        --    (say, bl+1), cumul_per_bit_length(bl) contains the amount of symbols
        --    for the stream to be encoded that can be emitted as immediate
        --    successors to symbol si by using follower shortcuts of bit length bl.
        follo := order (si, sj);
        Followers (si, sj) := follo;
        follower_pos (si, follo) := sj;
      end loop;
      --  Now we decide to which length we are using the followers
      min_size := Real (total_row (si)) * 8.0;
      --         ^ Size of all codes, in bits, when no follower
      --           at all is defined for symbol si. In the next
      --           loop, we will work to reduce that size.
      bits_min := 0;
      for bits in 1 .. Bit_range'Last loop
        --  We compute the exact size of reduced output (entire file!) for
        --  symbol si when cutting the follower range to 0..2**bits-1.
        exp_size :=
          --  Coded followers:
          Real (cumul_per_bit_length (bits))                  * Real (bits + 1) +
          --  All codes outside the follower list will take 8+1 bits:
          Real (total_row (si) - cumul_per_bit_length (bits)) * 9.0 +
          --  Also the follower list at the beginning takes place:
          Real (max_follo (bits) + 1) * 8.0;
        if exp_size < min_size then
          --  So far, it is more efficient to encode si's followers with 'bits' bits.
          min_size := exp_size;
          bits_min := bits;
        end if;
      end loop;
      Slen (si) := max_follo (bits_min) + 1;
      for sj in 0 .. max_follo (bits_min) loop
        has_follower (si, Followers (si, sj)) := True;
      end loop;
    end loop;
    if trace then
      Show_partial_markov (True);
    end if;
  end Build_Followers;

  --------------------------------
  -- LZ77 front-end compression --
  --------------------------------

  --  Cache for LZ-compressed data, to speedup the 2nd phase:

  LZ_cache_size : constant := 2**18; -- 256KB
  type LZ_buffer_range is mod LZ_cache_size;
  type LZ_buffer is array (LZ_buffer_range) of Byte; -- circular buffer

  type LZ_cache_type is record
    buf : LZ_buffer;             --  buf's index arithmetic is mod LZ_cache_size
    nxt : LZ_buffer_range := 0;  --  position of next byte to be written
    cnt : Natural := 0;          --  [0..size]: count of cached bytes
  end record;

  LZ_cache : LZ_cache_type;
  lz77_pos, lz77_size : Zip_64_Data_Size_Type := 0;

  --  Possible ranges for LZ distance and length encoding
  --  in the Zip-Reduce format:

  subtype Length_range is
    Integer range 4 .. 2**(8 - reduction_factor) + 257;

  subtype Distance_range is
    Integer range 1 .. (2**reduction_factor) * 256;

  --        max length  max dist
  --    1   385         512
  --    2   321         1024
  --    3   289         2048
  --    4   273         4096

  type Phase_type is (compute_stats, compress_for_real);

  generic
    phase : Phase_type;
  procedure Encode_with_Reduce;

  procedure Encode_with_Reduce is
    using_LZ77 : Boolean;
    Derail_LZ77 : exception;
    feedback_milestone,
    Bytes_in   : Zip_Streams.ZS_Size_Type := 0;   --  Count of input file bytes processed
    user_aborting : Boolean;
    real_pct : constant array (Phase_type) of Integer := (0, 50);
    PctDone : Natural;

    function Read_byte return Byte is
      b : Byte;
    begin
      b := IO_buffers.InBuf (IO_buffers.InBufIdx);
      IO_buffers.InBufIdx := IO_buffers.InBufIdx + 1;
      if phase = compute_stats then
        Zip.CRC_Crypto.Update (CRC, (1 => b));
      end if;
      Bytes_in := Bytes_in + 1;
      if feedback /= null then
        if Bytes_in = 1 then
          feedback (real_pct (phase), False, user_aborting);
        end if;
        if feedback_milestone > 0 and then
           ((Bytes_in - 1) mod feedback_milestone = 0
            or Bytes_in = ZS_Size_Type (input_size))
        then
          if input_size_known then
            PctDone := real_pct (phase) + Integer ((50.0 * Float (Bytes_in)) / Float (input_size));
            feedback (PctDone, False, user_aborting);
          else
            feedback (real_pct (phase), False, user_aborting);
          end if;
          if user_aborting then
            raise User_abort;
          end if;
        end if;
      end if;
      return b;
    end Read_byte;

    function More_bytes return Boolean is
    begin
      if IO_buffers.InBufIdx > IO_buffers.MaxInBufIdx then
        Read_Block (IO_buffers, input);
      end if;
      return not IO_buffers.InputEoF;
    end More_bytes;

    upper_shift : constant Integer := 2**(8 - reduction_factor);
    maximum_len_1 : constant Integer := upper_shift - 1;
    maximum_len_1_b : constant Byte := Byte (maximum_len_1);

    --  LZ77 params
    Look_redfac        : constant array (1 .. 4) of Integer := (31, 63, 255, 191);
    --  See za_work.xls, sheet Reduce, for the cooking of these numbers...
    Look_Ahead         : constant Integer := Look_redfac (reduction_factor);
    String_buffer_size : constant := 2**12;  --  2**n optimizes "mod" to "and"
    Threshold          : constant := 3;

    --  If the DLE coding doesn't fit the format constraints, we
    --  need to decode it as a simple sequence of literals
    --  before the probabilistic reduction.

    type Text_Buffer is array (0 .. String_buffer_size + Look_Ahead - 1) of Byte;
    Text_Buf : Text_Buffer;
    R : Natural;

    last_b : Symbol_range := 0;

    --  Raw byte: post LZ77 / DLE coding, pre probabilistic reduction
    procedure Write_raw_byte (b : Byte) is
      curr_b : constant Symbol_range := Symbol_range (b);
      follo : Boolean;
    begin
      lz77_pos := lz77_pos + 1;
      case phase is
        --
        when compute_stats =>
          markov_d (last_b, curr_b) := markov_d (last_b, curr_b) + 1;
          --  We also feed the cache which will be read at the 2nd phase:
          LZ_cache.buf (LZ_cache.nxt) := b;
          LZ_cache.nxt := LZ_cache.nxt + 1;
          LZ_cache.cnt := Natural'Min (LZ_cache_size, LZ_cache.cnt + 1);
        when compress_for_real =>  --  Probabilistic reduction
          if Slen (last_b) = 0 then
            --  Follower set is empty for this character.
            Put_code (b, 8);
          else
            follo := has_follower (last_b, curr_b);
            Put_code (1 - Boolean'Pos (follo), 1);
            --  ^ Certainly a weakness of this format is that each byte is preceded by
            --    a flag signaling "clear text" or compressed.
            if follo then
              Put_code (Byte (follower_pos (last_b, curr_b)), B_Table (Slen (last_b)));
            else
              Put_code (b, 8);
            end if;
          end if;
      end case;
      last_b := curr_b;
      if phase = compress_for_real and then
         using_LZ77 and then
         (lz77_size - lz77_pos) < Zip_64_Data_Size_Type (LZ_cache.cnt)
        --  We have entered the zone covered by the cache, so no need
        --  to continue the LZ77 compression effort: the results are
        --  already stored.
      then
        raise Derail_LZ77;
        --  We interrupt the LZ77 compression: data has been already
        --  cached upon first pass (phase = stats), no need to redo it.
      end if;
    end Write_raw_byte;

    --  The following procedures, Write_normal_byte and Write_DL_code,
    --  are called by the LZ77 compressor

    --  Write a normal, "clear-text", character
    procedure Write_normal_byte (b : Byte) is
    begin
      Write_raw_byte (b);
      if b = DLE_code then
        --  disambiguate situation where the character happens to have
        --  the same 'Pos as the DLE code
        Write_raw_byte (0);
      end if;
      Text_Buf (R) := b;
      R := (R + 1) mod String_buffer_size;
    end Write_normal_byte;

    --  Write a Distance-Length code
    procedure Write_DL_code (distance, length : Integer) is
      Copy_start : constant Natural := (R - distance) mod String_buffer_size;
      len : constant Integer := length - 3;
      dis : constant Integer := distance - 1;
      dis_upper : Byte;
    begin
      if distance in Distance_range and length in Length_range then
        Write_raw_byte (DLE_code);
        dis_upper := Byte ((dis / 256) * upper_shift);
        --  Encode length and upper part of distance
        if len < maximum_len_1 then
          Write_raw_byte (Byte (len) + dis_upper);
        else
          Write_raw_byte (maximum_len_1_b + dis_upper);
          Write_raw_byte (Byte (len - maximum_len_1));
        end if;
        --  Encode distance
        Write_raw_byte (Byte (dis mod 256));
        --  Expand in the circular text buffer to have it up to date
        for K in 0 .. length - 1 loop
          Text_Buf (R) := Text_Buf ((Copy_start + K) mod String_buffer_size);
          R := (R + 1) mod String_buffer_size;
        end loop;
      else
        --  Cannot encode this distance-length pair, then expand to output :-(
        --  if phase= compress then Put("Aie! (" & distance'img & length'img & ")"); end if;
        for K in 0 .. length - 1 loop
          Write_normal_byte (Text_Buf ((Copy_start + K) mod String_buffer_size));
        end loop;
      end if;
    end Write_DL_code;

    procedure Dummy_Estimate_DL_Codes (
      matches          : in out LZ77.Matches_Array;
      old_match_index  : in     Natural;
      prefixes         : in     LZ77.Byte_Array;
      best_score_index :    out Positive;
      best_score_set   :    out LZ77.Prefetch_Index_Type;
      match_trace      :    out LZ77.DLP_Array
    )
    is null;

    procedure My_LZ77 is
      new LZ77.Encode
               (String_buffer_size => String_buffer_size,
                Look_Ahead         => Look_Ahead,
                Threshold          => Threshold,
                Method             => LZ77.LZHuf,
                --  NB: Method IZ_9 needs exactly the same set of LZ77 parameters as in
                --      Deflate. Then the compression is worse, though much faster.
                Read_Byte          => Read_byte,
                More_Bytes         => More_bytes,
                Write_Literal      => Write_normal_byte,
                Write_DL_Code      => Write_DL_code,
                Estimate_DL_Codes  => Dummy_Estimate_DL_Codes);

    procedure Finish_Cache is
      i : LZ_buffer_range := LZ_buffer_range (lz77_pos mod LZ_cache_size);
    begin
      while lz77_pos < lz77_size loop
        Write_raw_byte (LZ_cache.buf (i));
        i := i + 1;
      end loop;
    end Finish_Cache;

  begin  --  Encode_with_Reduce
    Read_Block (IO_buffers, input);
    R := String_buffer_size - Look_Ahead;
    if input_size_known then
      feedback_milestone := ZS_Size_Type (input_size / feedback_steps);
    end if;
    using_LZ77 := True;
    My_LZ77;
  exception
    when Derail_LZ77 =>  --  LZ77 compression interrupted because compressed data already cached
      using_LZ77 := False;
      Finish_Cache;
      if feedback /= null then
        feedback (100, False, user_aborting);
      end if;
  end Encode_with_Reduce;

  procedure Build_stats   is new Encode_with_Reduce (phase => compute_stats);
  procedure Compress_data is new Encode_with_Reduce (phase => compress_for_real);

  mem : ZS_Index_Type;

begin
  Allocate_Buffers (IO_buffers, input_size_known, input_size);
  output_size := 0;
  mem := Index (input);
  --  Pass 1: statistics to calibrate the probabilistic expansion
  Build_stats;
  Set_Index (input, mem); -- go back to beginning of message to compress
  Build_Followers;
  --  Pass 2: actual compression
  Save_byte := 0;  --  Initialize output bit buffer
  Bits_used := 0;
  Save_Followers;  --  Emit the compression structure before the compressed message
  lz77_size := lz77_pos;
  lz77_pos := 0;
  begin
    Compress_data;  --  Emit the compressed message
    Flush_output;
    compression_ok := True;
  exception
    when Compression_inefficient =>
      compression_ok := False;
  end;
  Deallocate_Buffers (IO_buffers);
exception
  when others =>
    Deallocate_Buffers (IO_buffers);
    raise;
end Zip.Compress.Reduce;
Zip-Ada: Ada library for zip archive files (.zip). Ada programming.
Some news about Zip-Ada and other Ada projects on Gautier's blog.