Back to... Zip-Ada

Source file : zip-compress-deflate.adb



   1  --  Legal licensing note:
   2  
   3  --  Copyright (c) 2009 .. 2024 Gautier de Montmollin
   4  --  SWITZERLAND
   5  
   6  --  Permission is hereby granted, free of charge, to any person obtaining a copy
   7  --  of this software and associated documentation files (the "Software"), to deal
   8  --  in the Software without restriction, including without limitation the rights
   9  --  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  --  copies of the Software, and to permit persons to whom the Software is
  11  --  furnished to do so, subject to the following conditions:
  12  
  13  --  The above copyright notice and this permission notice shall be included in
  14  --  all copies or substantial portions of the Software.
  15  
  16  --  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  --  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  --  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  --  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  --  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  --  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  --  THE SOFTWARE.
  23  
  24  --  NB: this is the MIT License, as found on the site
  25  --  http://www.opensource.org/licenses/mit-license.php
  26  
  27  -----------------
  28  --  The "Deflate" method combines a LZ77 compression
  29  --  method with some Huffman encoding gymnastics.
  30  --
  31  --  Magic numbers in this procedure are adjusted through experimentation and marked with: *Tuned*
  32  --
  33  --  To do:
  34  --    - Taillaule: try with slider and/or initial lz window not centered
  35  --    - Taillaule: compare slider to random and fixed in addition to initial
  36  --    - Taillaule: try L_sup distance
  37  --    - Taillaule: restrict BL_Vector to short LZ distances (long distances perhaps too random)
  38  --    - Taillaule: check LZ vector norms on literals only, too (consider distances & lengths as noise)
  39  --    - Taillaule: use a corpus of files badly compressed by our Deflate comparatively
  40  --        to other Deflates (e.g. 7Z seems better with databases)
  41  --    - Add DeflOpt to slowest method, or approximate it by tweaking
  42  --        distance and length statistics before computing their Huffman codes, or
  43  --        reinvent it by computing the size of emitted codes and trying slight changes
  44  --        to the codes' bit lengths.
  45  --    - Improve LZ77 compression: see Zip.LZ77 to-do list; check with bypass_LZ77 below
  46  --        and various programs based on LZ77 using the trace >= some and the LZ77 dump
  47  --        in UnZip.Decompress.
  48  --    - Make this procedure standalone & generic like LZMA.Encoding;
  49  --        use it in the Zada project (Zlib replacement)
  50  --
  51  --  Change log:
  52  --------------
  53  --
  54  --  16-Mar-2016: Taillaule algorithm: first version ready for release.
  55  --  20-Feb-2016: (rev.305) Start of smarter techniques for "Dynamic" encoding: Taillaule algorithm
  56  --   4-Feb-2016: Start of "Dynamic" encoding format (compression structure sent before block)
  57  --
  58  --  19-Feb-2011: All distance and length codes implemented.
  59  --  18-Feb-2011: First version working with Deflate fixed and restricted distance & length codes.
  60  --  17-Feb-2011: Created (single-block, "fixed" Huffman encoding).
  61  
  62  with Huffman.Encoding.Length_Limited_Coding;
  63  with LZ77;
  64  
  65  with Ada.Text_IO,
  66       Ada.Unchecked_Deallocation;
  67  
  68  procedure Zip.Compress.Deflate
  69    (input,
  70     output           : in out Zip_Streams.Root_Zipstream_Type'Class;
  71     input_size_known :        Boolean;
  72     input_size       :        Zip_64_Data_Size_Type;  --  ignored if unknown
  73     feedback         :        Feedback_Proc;
  74     method           :        Deflation_Method;
  75     CRC              : in out Interfaces.Unsigned_32;  --  only updated here
  76     crypto           : in out CRC_Crypto.Crypto_pack;
  77     output_size      :    out Zip_64_Data_Size_Type;
  78     compression_ok   :    out Boolean)  --  indicates compressed < uncompressed
  79  is
  80    --  Options for testing.
  81    --  All should be on False for normal use of this procedure.
  82  
  83    deactivate_scanning : constant Boolean := False;  --  Impact analysis of the scanning method
  84    trace               : constant Boolean := False;  --  Log file with details
  85    trace_descriptors   : constant Boolean := False;  --  Additional logging of Huffman descriptors
  86  
  87    --  A log file is used when trace = True.
  88    log         : Ada.Text_IO.File_Type;
  89    log_name    : constant String := "Zip.Compress.Deflate.zcd";  --  A CSV with an unusual extension
  90    sep         : constant Character := ';';
  91  
  92    use Ada.Text_IO;
  93    use Interfaces;
  94  
  95    -------------------------------------
  96    -- Buffered I/O - byte granularity --
  97    -------------------------------------
  98  
  99    IO_buffers : IO_Buffers_Type;
 100  
 101    procedure Put_byte (B : Byte) is  --  Put a byte, at the byte granularity level
 102    pragma Inline (Put_byte);
 103    begin
 104      IO_buffers.OutBuf (IO_buffers.OutBufIdx) := B;
 105      IO_buffers.OutBufIdx := IO_buffers.OutBufIdx + 1;
 106      if IO_buffers.OutBufIdx > IO_buffers.OutBuf'Last then
 107        Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
 108      end if;
 109    end Put_byte;
 110  
 111    procedure Flush_byte_buffer is
 112    begin
 113      if IO_buffers.OutBufIdx > 1 then
 114        Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
 115      end if;
 116    end Flush_byte_buffer;
 117  
 118    ------------------------------------------------------
 119    --  Bit code buffer, for sending data at bit level  --
 120    ------------------------------------------------------
 121  
 122    --  Output buffer. Bits are inserted starting at the right (least
 123    --  significant bits). The width of bit_buffer must be at least 16 bits.
 124    subtype U32 is Unsigned_32;
 125    bit_buffer : U32 := 0;
 126    --  Number of valid bits in bit_buffer.  All bits above the last valid bit are always zero.
 127    valid_bits : Integer := 0;
 128  
 129    procedure Flush_bit_buffer is
 130    begin
 131      while valid_bits > 0 loop
 132        Put_byte (Byte (bit_buffer and 16#FF#));
 133        bit_buffer := Shift_Right (bit_buffer, 8);
 134        valid_bits := Integer'Max (0, valid_bits - 8);
 135      end loop;
 136      bit_buffer := 0;
 137    end Flush_bit_buffer;
 138  
 139    --  Bit codes are at most 15 bits for Huffman codes,
 140    --  or 13 for explicit codes (distance extra bits).
 141    subtype Code_Size_Type is Integer range 1 .. 15;
 142  
 143    --  Send a value on a given number of bits.
 144    procedure Put_Bits (code : U32; code_size : Code_Size_Type) with Inline is
 145    begin
 146      --  Put bits from code at the left of existing ones. They might be shifted away
 147      --  partially on the left side (or even entirely if valid_bits is already = 32).
 148      bit_buffer := bit_buffer or Shift_Left (code, valid_bits);
 149      valid_bits := valid_bits + code_size;
 150      if valid_bits > 32 then
 151        --  Flush 32 bits to output as 4 bytes
 152        Put_byte (Byte (bit_buffer and 16#FF#));
 153        Put_byte (Byte (Shift_Right (bit_buffer,  8) and 16#FF#));
 154        Put_byte (Byte (Shift_Right (bit_buffer, 16) and 16#FF#));
 155        Put_byte (Byte (Shift_Right (bit_buffer, 24) and 16#FF#));
 156        valid_bits := valid_bits - 32;
 157        --  Empty buffer and put on it the rest of the code
 158        bit_buffer := Shift_Right (code, code_size - valid_bits);
 159      end if;
 160    end Put_Bits;
 161  
 162    ------------------------------------------------------
 163    -- Deflate, post LZ encoding, with Huffman encoding --
 164    ------------------------------------------------------
 165  
 166    --  The Huffman code set (and therefore the Huffman tree) is completely determined by
 167    --  the bit length to be used for reaching leaf nodes, thanks to two special
 168    --  rules (explanation in RFC 1951, section 3.2.2).
 169    --
 170    --  So basically the process is the following:
 171    --
 172    --     (A) Gather statistics (just counts) for the alphabet.
 173    --     (B) Turn these counts into code lengths, by calling Length_limited_Huffman_code_lengths.
 174    --     (C) Build Huffman codes (the bits to be sent) with a call to Prepare_Huffman_codes.
 175    --
 176    --  In short:
 177    --
 178    --     data -> (A) -> stats -> (B) -> Huffman codes' bit lengths -> (C) -> Huffman codes
 179  
 180    type Bit_Length_Array is array (Natural range <>) of Natural;
 181  
 182    subtype Alphabet_lit_len is Natural range 0 .. 287;
 183    subtype Bit_length_array_lit_len is Bit_Length_Array (Alphabet_lit_len);
 184    subtype Alphabet_dis is Natural range 0 .. 31;
 185    subtype Bit_length_array_dis is Bit_Length_Array (Alphabet_dis);
 186  
 187    type Deflate_Huff_Descriptors is record
 188      --  Tree descriptor for Literal, EOB or Length encoding
 189      lit_len : Huffman.Encoding.Descriptor (0 .. 287);
 190      --  Tree descriptor for Distance encoding
 191      dis : Huffman.Encoding.Descriptor (0 .. 31);
 192    end record;
 193    --  NB: Appnote: "Literal codes 286-287 and distance codes 30-31 are never used
 194    --                  but participate in the Huffman construction."
 195    --  Setting upper bound to 285 for literals leads to invalid codes, sometimes.
 196  
 197    --  Copy bit length vectors into Deflate Huffman descriptors
 198  
 199    function Build_descriptors (
 200      bl_for_lit_len : Bit_length_array_lit_len;
 201      bl_for_dis     : Bit_length_array_dis
 202    )
 203    return Deflate_Huff_Descriptors
 204    is
 205      new_d : Deflate_Huff_Descriptors;
 206    begin
 207      for i in new_d.lit_len'Range loop
 208        new_d.lit_len (i) := (bit_length => bl_for_lit_len (i), code => Huffman.Encoding.invalid);
 209        if trace_descriptors and then trace and then Is_Open (log) then
 210          Put (log, Integer'Image (bl_for_lit_len (i)) & sep);
 211        end if;
 212      end loop;
 213      for i in new_d.dis'Range loop
 214        new_d.dis (i) := (bit_length => bl_for_dis (i), code => Huffman.Encoding.invalid);
 215        if trace_descriptors and then trace and then Is_Open (log) then
 216          Put (log, Integer'Image (bl_for_dis (i)) & sep);
 217        end if;
 218      end loop;
 219      if trace_descriptors and then trace and then Is_Open (log) then
 220        New_Line (log);
 221      end if;
 222      return new_d;
 223    end Build_descriptors;
 224  
 225    type Count_type is range 0 .. Zip_64_Data_Size_Type'Last / 2 - 1;
 226    type Stats_type is array (Natural range <>) of Count_type;
 227  
 228    --  The following is a translation of Zopfli's OptimizeHuffmanForRle (v. 11-May-2016).
 229    --  Possible gain: shorten the compression header containing the Huffman trees' bit lengths.
 230    --  Possible loss: since the stats do not correspond anymore exactly to the data
 231    --  to be compressed, the Huffman trees might be suboptimal.
 232    --
 233    --  Zopfli comment:
 234    --    Changes the population counts in a way that the consequent Huffman tree
 235    --    compression, especially its rle-part, will be more likely to compress this data
 236    --    more efficiently.
 237    --
 238    procedure Tweak_for_better_RLE (counts : in out Stats_type) is
 239      length : Integer := counts'Length;
 240      stride : Integer;
 241      symbol, sum, limit, new_count : Count_type;
 242      good_for_rle : array (counts'Range) of Boolean := (others => False);
 243    begin
 244      --  1) We don't want to touch the trailing zeros. We may break the
 245      --     rules of the format by adding more data in the distance codes.
 246      loop
 247        if length = 0 then
 248          return;
 249        end if;
 250        exit when counts (length - 1) /= 0;
 251        length := length - 1;
 252      end loop;
 253      --  Now counts(0..length - 1) does not have trailing zeros.
 254      --
 255      --  2) Let's mark all population counts that already can be encoded with an rle code.
 256      --
 257      --  Let's not spoil any of the existing good rle codes.
 258      --  Mark any seq of 0's that is longer than 5 as a good_for_rle.
 259      --  Mark any seq of non-0's that is longer than 7 as a good_for_rle.
 260      symbol := counts (0);
 261      stride := 0;
 262      for i in 0 .. length loop
 263        if i = length or else counts (i) /= symbol then
 264          if (symbol = 0 and then stride >= 5) or else (symbol /= 0 and then stride >= 7) then
 265            for k in 0 .. stride - 1 loop
 266              good_for_rle (i - k - 1) := True;
 267            end loop;
 268          end if;
 269          stride := 1;
 270          if i /= length then
 271            symbol := counts (i);
 272          end if;
 273        else
 274          stride := stride + 1;
 275        end if;
 276      end loop;
 277      --  3) Let's replace those population counts that lead to more rle codes.
 278      stride := 0;
 279      limit := counts (0);
 280      sum := 0;
 281      for i in 0 .. length loop
 282        if i = length or else good_for_rle (i)
 283            or else (i > 0 and then good_for_rle (i - 1))  --  Added from Brotli, item #1
 284            --  Heuristic for selecting the stride ranges to collapse.
 285            or else abs (counts (i) - limit) >= 4
 286        then
 287          if stride >= 4 or else (stride >= 3 and then sum = 0) then
 288            --  The stride must end, collapse what we have, if we have enough (4).
 289            --  GdM: new_count is the average of counts on the stride's interval, upper-rounded.
 290            new_count := Count_type'Max (1, (sum + Count_type (stride) / 2) / Count_type (stride));
 291            if sum = 0 then
 292              --  Don't make an all zeros stride to be upgraded to ones.
 293              new_count := 0;
 294            end if;
 295            for k in 0 .. stride - 1 loop
 296              --  We don't want to change value at counts(i),
 297              --  that is already belonging to the next stride. Thus - 1.
 298              counts (i - k - 1) := new_count;  --  GdM: Replace histogram value by averaged value.
 299            end loop;
 300          end if;
 301          stride := 0;
 302          sum := 0;
 303          if i < length - 3 then
 304            --  All interesting strides have a count of at least 4, at least when non-zeros.
 305            --  GdM: limit is the average of next 4 counts, upper-rounded.
 306            limit := (counts (i) + counts (i + 1) + counts (i + 2) + counts (i + 3) + 2) / 4;
 307          elsif i < length then
 308            limit := counts (i);
 309          else
 310            limit := 0;
 311          end if;
 312        end if;
 313        stride := stride + 1;
 314        if i /= length then
 315          sum := sum + counts (i);
 316        end if;
 317      end loop;
 318    end Tweak_for_better_RLE;
 319  
 320    subtype Stats_lit_len_type is Stats_type (Alphabet_lit_len);
 321    subtype Stats_dis_type is Stats_type (Alphabet_dis);
 322  
 323    --  Phase (B) : we turn statistics into Huffman bit lengths.
 324    function Build_descriptors (
 325      stats_lit_len  : Stats_lit_len_type;
 326      stats_dis      : Stats_dis_type
 327    )
 328    return Deflate_Huff_Descriptors
 329    is
 330      bl_for_lit_len : Bit_length_array_lit_len;
 331      bl_for_dis     : Bit_length_array_dis;
 332      procedure LLHCL_lit_len is new
 333        Huffman.Encoding.Length_Limited_Coding
 334          (Alphabet_lit_len, Count_type, Stats_lit_len_type, Bit_length_array_lit_len, 15);
 335      procedure LLHCL_dis is new
 336        Huffman.Encoding.Length_Limited_Coding
 337          (Alphabet_dis, Count_type, Stats_dis_type, Bit_length_array_dis, 15);
 338      stats_dis_copy : Stats_dis_type := stats_dis;
 339      --
 340      procedure Patch_statistics_for_buggy_decoders is
 341        --  Workaround for buggy Info-Zip decoder versions.
 342        --  See "PatchDistanceCodesForBuggyDecoders" in Zopfli's deflate.c
 343        --  NB: here, we patch the statistics and not the resulting bit lengths,
 344        --      to be sure we avoid invalid Huffman code sets in the end.
 345        --  The decoding bug concerns Zlib v.<= 1.2.1, UnZip v.<= 6.0, WinZip v.<=10.0.
 346        used : Natural := 0;
 347      begin
 348        for i in stats_dis_copy'Range loop
 349          if stats_dis_copy (i) /= 0 then
 350            used := used + 1;
 351          end if;
 352        end loop;
 353        case used is
 354          when 0 =>  --  No distance code used at all (data must be almost random).
 355            stats_dis_copy (0 .. 1) := (1, 1);
 356          when 1 =>
 357            if stats_dis_copy (0) = 0 then
 358              stats_dis_copy (0) := 1;  --  Now, code 0 and some other code have non-zero counts.
 359            else
 360              stats_dis_copy (1) := 1;  --  Now, codes 0 and 1 have non-zero counts.
 361            end if;
 362          when others =>
 363            null;  --  No workaround needed when 2 or more distance codes are defined.
 364        end case;
 365      end Patch_statistics_for_buggy_decoders;
 366    begin
 367      Patch_statistics_for_buggy_decoders;
 368      LLHCL_lit_len (stats_lit_len, bl_for_lit_len);  --  Call the magic algorithm for setting
 369      LLHCL_dis (stats_dis_copy, bl_for_dis);         --    up Huffman lengths of both trees
 370      return Build_descriptors (bl_for_lit_len, bl_for_dis);
 371    end Build_descriptors;
 372  
 373    --  Here is one original part in the Taillaule algorithm: use of basic
 374    --  topology (L1, L2 distances) to check similarities between Huffman code sets.
 375  
 376    --  Bit length vector. Convention: 16 is unused bit length (close to the bit length for the
 377    --  rarest symbols, 15, and far from the bit length for the most frequent symbols, 1).
 378    --  Deflate uses 0 for unused.
 379    subtype BL_code is Integer_M32 range 1 .. 16;
 380    type BL_vector is array (1 .. 288 + 32) of BL_code;
 381  
 382    function Convert (h : Deflate_Huff_Descriptors) return BL_vector is
 383      bv : BL_vector;
 384      j : Positive := 1;
 385    begin
 386      for i in h.lit_len'Range loop
 387        if h.lit_len (i).bit_length = 0 then
 388          bv (j) := 16;
 389        else
 390          bv (j) := Integer_M32 (h.lit_len (i).bit_length);
 391        end if;
 392        j := j + 1;
 393      end loop;
 394      for i in h.dis'Range loop
 395        if h.dis (i).bit_length = 0 then
 396          bv (j) := 16;
 397        else
 398          bv (j) := Integer_M32 (h.dis (i).bit_length);
 399        end if;
 400        j := j + 1;
 401      end loop;
 402      return bv;
 403    end Convert;
 404  
 405    --  L1 or Manhattan distance
 406    function L1_distance (b1, b2 : BL_vector) return Natural_M32 is
 407      s : Natural_M32 := 0;
 408    begin
 409      for i in b1'Range loop
 410        s := s + abs (b1 (i) - b2 (i));
 411      end loop;
 412      return s;
 413    end L1_distance;
 414  
 415    --  L1, tweaked
 416    --
 417    tweak : constant array (BL_code) of Positive_M32 :=
 418      --  For the origin of the tweak function, see "za_work.xls", sheet "Deflate".
 419      --  function f3 = 0.20 f1 [logarithmic] + 0.80 * identity
 420      --  NB: all values are multiplied by 100 for accuracy.
 421          (100, 255, 379, 490, 594, 694, 791, 885, 978, 1069, 1159, 1249, 1338, 1426, 1513, 1600);
 422      --  Neutral is:
 423      --  (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600)
 424  
 425    --
 426    function L1_tweaked (b1, b2 : BL_vector) return Natural_M32 is
 427      s : Natural_M32 := 0;
 428    begin
 429      for i in b1'Range loop
 430        s := s + abs (tweak (b1 (i)) - tweak (b2 (i)));
 431      end loop;
 432      return s;
 433    end L1_tweaked;
 434  
 435    --  L2 or Euclidean distance
 436    function L2_distance_square (b1, b2 : BL_vector) return Natural_M32 is
 437      s : Natural_M32 := 0;
 438    begin
 439      for i in b1'Range loop
 440        s := s + (b1 (i) - b2 (i)) ** 2;
 441      end loop;
 442      return s;
 443    end L2_distance_square;
 444  
 445    --  L2, tweaked
 446    function L2_tweaked_square (b1, b2 : BL_vector) return Natural_M32 is
 447      s : Natural_M32 := 0;
 448    begin
 449      for i in b1'Range loop
 450        s := s + (tweak (b1 (i)) - tweak (b2 (i))) ** 2;
 451      end loop;
 452      return s;
 453    end L2_tweaked_square;
 454  
 455    type Distance_type is (L1, L1_tweaked, L2, L2_tweaked);
 456  
 457    function Similar
 458      (h1, h2    : Deflate_Huff_Descriptors;
 459       dist_kind : Distance_type;
 460       threshold : Natural;
 461       comment   : String)
 462    return Boolean
 463    is
 464      dist  : Natural_M32;
 465      thres : Natural_M32 := Natural_M32 (threshold);
 466    begin
 467      case dist_kind is
 468        when L1 =>
 469          dist := L1_distance (Convert (h1), Convert (h2));
 470        when L1_tweaked =>
 471          thres := thres * tweak (1);
 472          dist  := L1_tweaked (Convert (h1), Convert (h2));
 473        when L2 =>
 474          thres := thres * thres;
 475          dist  := L2_distance_square (Convert (h1), Convert (h2));
 476        when L2_tweaked =>
 477          thres := (thres * thres) * (tweak (1) * tweak (1));
 478          dist  := L2_tweaked_square (Convert (h1), Convert (h2));
 479      end case;
 480      if trace then
 481        Put_Line (log,
 482          "Checking similarity." & sep &
 483          Distance_type'Image (dist_kind) & sep &
 484          "Distance (ev. x100, **2):" & sep & Integer_M32'Image (dist) & sep & sep &
 485          "Threshold (ev. x100, **2):" & sep & Integer_M32'Image (thres) & sep & sep &
 486          comment
 487        );
 488      end if;
 489      return dist < thres;
 490    end Similar;
 491  
 492    --  Another original part in the Taillaule algorithm: the possibility of recycling
 493    --  Huffman codes. It is possible only if previous block was not stored and if
 494    --  the new block's used alphabets are included in the old block's used alphabets.
 495    function Recyclable (h_old, h_new : Deflate_Huff_Descriptors) return Boolean is
 496    begin
 497      for i in h_old.lit_len'Range loop
 498        if h_old.lit_len (i).bit_length = 0 and h_new.lit_len (i).bit_length > 0 then
 499          return False;  --  Code used in new, but not in old
 500        end if;
 501      end loop;
 502      for i in h_old.dis'Range loop
 503        if h_old.dis (i).bit_length = 0 and h_new.dis (i).bit_length > 0 then
 504          return False;  --  Code used in new, but not in old
 505        end if;
 506      end loop;
 507      return True;
 508    end Recyclable;
 509  
 510    --  Phase (C): the Prepare_Huffman_Codes procedure finds the Huffman code
 511    --  for each value, given the bit length imposed as input.
 512  
 513    function Prepare_Huffman_Codes (dhd : Deflate_Huff_Descriptors) return Deflate_Huff_Descriptors
 514    is
 515      dhd_var : Deflate_Huff_Descriptors := dhd;
 516    begin
 517      Huffman.Encoding.Prepare_Codes (dhd_var.lit_len, Code_Size_Type'Last, True);
 518      Huffman.Encoding.Prepare_Codes (dhd_var.dis, Code_Size_Type'Last, True);
 519      return dhd_var;
 520    end Prepare_Huffman_Codes;
 521  
 522    --  Emit a variable length Huffman code
 523    procedure Put_Huffman_Code (lc : Huffman.Encoding.Length_Code_Pair) is
 524    pragma Inline (Put_Huffman_Code);
 525    begin
 526      --  Huffman code of length 0 should never occur: when constructing
 527      --  the code lengths (LLHCL) any single occurrence in the statistics
 528      --  will trigger the build of a code length of 1 or more.
 529      Put_Bits
 530        (code      => U32 (lc.code),
 531         code_size => Code_Size_Type (lc.bit_length));  --  Range check for length 0 (if enabled).
 532    end Put_Huffman_Code;
 533  
 534    --  This is where the "dynamic" Huffman trees are sent before the block's data are sent.
 535    --
 536    --  The decoder needs to know in advance the pair of trees (1st tree for literals-eob-LZ
 537    --  lengths, 2nd tree for LZ distances) for decoding the compressed data.
 538    --  But this information takes some room. Fortunately Deflate allows for compressing it
 539    --  with a combination of Huffman and Run-Length Encoding (RLE) to make this header smaller.
 540    --  Concretely, the trees are described by the bit length of each symbol, so the header's
 541    --  content is a vector of length max 320, whose contents are in the 0 .. 18 range and typically
 542    --  look like:  ... 8, 8, 9, 7, 8, 10, 6, 8, 8, 8, 8, 8, 11, 8, 9, 8, ...
 543    --  Clearly this vector has redundancies and can be sent in a compressed form.
 544    --  In this example, the RLE will compress the string of 8's with a single code 8, then a code 17
 545    --  (repeat x times). Anyway, the very frequent 8's will be encoded with a small number of
 546    --  bits (less than the 5 plain bits, or maximum 7 Huffman-encoded bits
 547    --  needed for encoding integers in the 0 .. 18 range).
 548    --
 549    procedure Put_Compression_Structure
 550      (dhd           :        Deflate_Huff_Descriptors;
 551       cost_analysis :        Boolean;     --  If True: just simulate the whole, and count needed bits
 552       bits          : in out Count_type)  --  This is incremented when cost_analysis = True
 553    is
 554      subtype Alphabet is Integer range 0 .. 18;
 555      type Alpha_Array is new Bit_Length_Array (Alphabet);
 556      truc_freq, truc_bl : Alpha_Array;
 557      truc : Huffman.Encoding.Descriptor (Alphabet);
 558      --  Compression structure: cs_bl is the "big" array with all bit lengths
 559      --  for compressing data. cs_bl will be sent compressed, too.
 560      cs_bl : array (1 .. dhd.lit_len'Length + dhd.dis'Length) of Natural;
 561      last_cs_bl : Natural;
 562      max_used_lln_code : Alphabet_lit_len := 0;
 563      max_used_dis_code : Alphabet_dis     := 0;
 564      --
 565      procedure Concatenate_all_bit_lengths is
 566        idx : Natural := 0;
 567      begin
 568        for a in reverse Alphabet_lit_len loop
 569          if dhd.lit_len (a).bit_length > 0 then
 570            max_used_lln_code := a;
 571            exit;
 572          end if;
 573        end loop;
 574        for a in reverse Alphabet_dis loop
 575          if dhd.dis (a).bit_length > 0 then
 576            max_used_dis_code := a;
 577            exit;
 578          end if;
 579        end loop;
 580        --  Copy bit lengths for both trees into one array, cs_bl.
 581        for a in 0 .. max_used_lln_code loop
 582          idx := idx + 1;
 583          cs_bl (idx) := dhd.lit_len (a).bit_length;
 584        end loop;
 585        for a in 0 .. max_used_dis_code loop
 586          idx := idx + 1;
 587          cs_bl (idx) := dhd.dis (a).bit_length;
 588        end loop;
 589        last_cs_bl := idx;
 590      end Concatenate_all_bit_lengths;
 591      --
 592      extra_bits_needed : constant array (Alphabet) of Natural :=
 593         (16 => 2, 17 => 3, 18 => 7, others => 0);
 594      --
 595      type Emission_mode is (simulate, effective);
 596      --
 597      procedure Emit_data_compression_structures (emit_mode : Emission_mode) is
 598        procedure Emit_data_compression_atom (x : Alphabet; extra_code : U32 := 0) is
 599          --  x is a bit length (value in 0..15), or a RLE instruction
 600        begin
 601          case emit_mode is
 602            when simulate =>
 603              truc_freq (x) := truc_freq (x) + 1;  --  +1 for x's histogram bar
 604            when effective =>
 605              Put_Huffman_Code (truc (x));
 606              declare
 607                extra_bits : constant Natural := extra_bits_needed (x);
 608              begin
 609                if extra_bits > 0 then
 610                  Put_Bits (extra_code, extra_bits);
 611                end if;
 612              end;
 613          end case;
 614        end Emit_data_compression_atom;
 615        idx : Natural := 0;
 616        rep : Positive;  --  Number of times current atom is repeated, >= 1
 617      begin
 618        --  Emit the bit lengths, with some RLE encoding (Appnote: 5.5.3; RFC 1951: 3.2.7)
 619        idx := 1;
 620        loop
 621          rep := 1;  --  Current atom, cs_bl(idx), is repeated 1x so far - obvious, isn't it ?
 622          for j in idx + 1 .. last_cs_bl loop
 623            exit when cs_bl (j) /= cs_bl (idx);
 624            rep := rep + 1;
 625          end loop;
 626          --  Now rep is the number of repetitions of current atom, including itself.
 627          if idx > 1 and then cs_bl (idx) = cs_bl (idx - 1) and then rep >= 3
 628              --  Better repeat a long sequence of zeros by using codes 17 or 18
 629              --  just after a 138-long previous sequence:
 630            and then not (cs_bl (idx) = 0 and then rep > 6)
 631          then
 632            rep := Integer'Min (rep, 6);
 633            Emit_data_compression_atom (16, U32 (rep - 3));    -- 16: "Repeat previous 3 to 6 times"
 634            idx := idx + rep;
 635          elsif cs_bl (idx) = 0 and then rep >= 3 then
 636            --  The 0 bit length may occur on long ranges of an alphabet (unused symbols)
 637            if rep <= 10 then
 638              Emit_data_compression_atom (17, U32 (rep - 3));  -- 17: "Repeat zero 3 to 10 times"
 639            else
 640              rep := Integer'Min (rep, 138);
 641              Emit_data_compression_atom (18, U32 (rep - 11)); -- 18: "Repeat zero 11 to 138 times"
 642            end if;
 643            idx := idx + rep;
 644          else
 645            Emit_data_compression_atom (cs_bl (idx));
 646            idx := idx + 1;
 647          end if;
 648          exit when idx > last_cs_bl;
 649        end loop;
 650      end Emit_data_compression_structures;
 651      --  Alphabet permutation for shortening in-use alphabet.
 652      --  After the RLE codes 16, 17, 18 and the bit length 0, which are assumed to be always used,
 653      --  the most usual bit lengths (around 8, which is the "neutral" bit length) appear first.
 654      --  For example, if the rare bit lengths 1 and 15 don't occur in any of the two Huffman trees
 655      --  for LZ data, then codes 1 and 15 have a length 0 in the local Alphabet and we can omit
 656      --  sending the last two bit lengths.
 657      alphabet_permutation : constant array (Alphabet) of Natural :=
 658         (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
 659      procedure LLHCL is new
 660        Huffman.Encoding.Length_Limited_Coding (Alphabet, Natural, Alpha_Array, Alpha_Array, 7);
 661      a_non_zero : Alphabet;
 662    begin
 663      Concatenate_all_bit_lengths;
 664      truc_freq := (others => 0);
 665      Emit_data_compression_structures (simulate);
 666      --  We have now statistics of all bit lengths occurrences of both Huffman
 667      --  trees used for compressing the data.
 668      --  We turn these counts into bit lengths for the local tree
 669      --  that helps us to store the compression structure in a more compact form.
 670      LLHCL (truc_freq, truc_bl);  --  Call the magic algorithm for setting up Huffman lengths
 671      --  At least lengths for codes 16, 17, 18, 0 will always be sent,
 672      --  even if all other bit lengths are 0 because codes 1 to 15 are unused.
 673      a_non_zero := 3;
 674      for a in Alphabet loop
 675        if a > a_non_zero and then truc_bl (alphabet_permutation (a)) > 0 then
 676          a_non_zero := a;
 677        end if;
 678      end loop;
 679      if cost_analysis then
 680        --  In this mode, no data output: we sum up the exact
 681        --  number of bits needed by the compression header.
 682        bits := bits + 14 + Count_type (1 + a_non_zero) * 3;
 683        for a in Alphabet loop
 684          bits := bits + Count_type (truc_freq (a) * (truc_bl (a) + extra_bits_needed (a)));
 685        end loop;
 686      else
 687        --  We output the compression header to the output stream.
 688        for a in Alphabet loop
 689          truc (a).bit_length := truc_bl (a);
 690        end loop;
 691        Huffman.Encoding.Prepare_Codes (truc, Code_Size_Type'Last, True);
 692        --  Output of the compression structure
 693        Put_Bits (U32 (max_used_lln_code - 256), 5);  --  max_used_lln_code is always >= 256 = EOB code
 694        Put_Bits (U32 (max_used_dis_code), 5);
 695        Put_Bits (U32 (a_non_zero - 3), 4);
 696        --  Save the local alphabet's Huffman lengths. It's the compression structure
 697        --  for compressing the data compression structure. Easy, isn't it ?
 698        for a in 0 .. a_non_zero loop
 699          Put_Bits (U32 (truc (alphabet_permutation (a)).bit_length), 3);
 700        end loop;
 701        --  Emit the Huffman lengths for encoding the data, in the local Huffman-encoded fashion.
 702        Emit_data_compression_structures (effective);
 703      end if;
 704    end Put_Compression_Structure;
 705  
 706    End_Of_Block : constant := 256;
 707  
 708    --  Default Huffman trees, for "fixed" blocks, as defined in appnote.txt or RFC 1951
 709    default_lit_len_bl : constant Bit_length_array_lit_len :=
 710        (0 .. 143   => 8,  --  For literals ("plain text" bytes)
 711       144 .. 255   => 9,  --  For more literals ("plain text" bytes)
 712       End_Of_Block => 7,  --  For EOB (256)
 713       257 .. 279   => 7,  --  For length codes
 714       280 .. 287   => 8   --  For more length codes
 715      );
 716    default_dis_bl : constant Bit_length_array_dis := (others => 5);
 717  
 718    Deflate_fixed_descriptors : constant Deflate_Huff_Descriptors :=
 719      Prepare_Huffman_Codes (Build_descriptors (default_lit_len_bl, default_dis_bl));
 720  
 721    --  Current tree descriptors
 722    curr_descr : Deflate_Huff_Descriptors := Deflate_fixed_descriptors;
 723  
 724    --  Write a normal, "clear-text" (post LZ, pre Huffman), 8-bit character (literal)
 725    procedure Put_literal_byte (b : Byte) is
 726    begin
 727      Put_Huffman_Code (curr_descr.lit_len (Integer (b)));
 728    end Put_literal_byte;
 729  
 730    --  Possible ranges for distance and length encoding in the Zip-Deflate format:
 731    subtype Length_range is Integer range 3 .. 258;
 732    subtype Distance_range is Integer range 1 .. 32768;
 733  
 734    --  This is where LZ distance-length tokens are written to the output stream.
 735    --  The Deflate format defines a sort of logarithmic compression, with codes
 736    --  for various distance and length ranges, plus extra bits for specifying the
 737    --  exact values. The codes are sent as Huffman codes with variable bit lengths
 738    --  (nothing to do with the lengths of LZ distance-length tokens).
 739  
 740    --                              Length Codes
 741    --                              ------------
 742    --       Extra             Extra              Extra              Extra
 743    --  Code Bits Length  Code Bits Lengths  Code Bits Lengths  Code Bits Length(s)
 744    --  ---- ---- ------  ---- ---- -------  ---- ---- -------  ---- ---- ---------
 745    --   257   0     3     265   1   11,12    273   3   35-42    281   5  131-162
 746    --   258   0     4     266   1   13,14    274   3   43-50    282   5  163-194
 747    --   259   0     5     267   1   15,16    275   3   51-58    283   5  195-226
 748    --   260   0     6     268   1   17,18    276   3   59-66    284   5  227-257
 749    --   261   0     7     269   2   19-22    277   4   67-82    285   0    258
 750    --   262   0     8     270   2   23-26    278   4   83-98
 751    --   263   0     9     271   2   27-30    279   4   99-114
 752    --   264   0    10     272   2   31-34    280   4  115-130
 753    --
 754    --  Example: the code # 266 means the LZ length (# of message bytes to be copied)
 755    --           shall be 13 or 14, depending on the extra bit value.
 756  
 757    deflate_code_for_lz_length : constant array (Length_range) of Natural :=
 758        (3          => 257,  -- Codes 257..264, with no extra bit
 759         4          => 258,
 760         5          => 259,
 761         6          => 260,
 762         7          => 261,
 763         8          => 262,
 764         9          => 263,
 765         10         => 264,
 766         11  .. 12  => 265,  -- Codes 265..268, with 1 extra bit
 767         13  .. 14  => 266,
 768         15  .. 16  => 267,
 769         17  .. 18  => 268,
 770         19  .. 22  => 269,  -- Codes 269..272, with 2 extra bits
 771         23  .. 26  => 270,
 772         27  .. 30  => 271,
 773         31  .. 34  => 272,
 774         35  .. 42  => 273,  -- Codes 273..276, with 3 extra bits
 775         43  .. 50  => 274,
 776         51  .. 58  => 275,
 777         59  .. 66  => 276,
 778         67  .. 82  => 277,  -- Codes 277..280, with 4 extra bits
 779         83  .. 98  => 278,
 780         99  .. 114 => 279,
 781         115 .. 130 => 280,
 782         131 .. 162 => 281,  -- Codes 281..284, with 5 extra bits
 783         163 .. 194 => 282,
 784         195 .. 226 => 283,
 785         227 .. 257 => 284,
 786         258        => 285   -- Code 285, with no extra bit
 787       );
 788  
 789    extra_bits_for_lz_length_offset : constant array (Length_range) of Integer :=
 790        (3 ..  10 | 258 =>  Huffman.Encoding.invalid,  --  just a placeholder, there is no extra bit there!
 791        11 ..  18       =>  11,
 792        19 ..  34       =>  19,
 793        35 ..  66       =>  35,
 794        67 .. 130       =>  67,
 795       131 .. 257       => 131);
 796  
 797    extra_bits_for_lz_length : constant array (Length_range) of Natural :=
 798        (3 ..  10 | 258 => 0,
 799        11 ..  18       => 1,
 800        19 ..  34       => 2,
 801        35 ..  66       => 3,
 802        67 .. 130       => 4,
 803       131 .. 257       => 5);
 804  
 805    procedure Put_DL_code (distance : Distance_range; length : Length_range) is
 806      extra_bits : Natural;
 807    begin
 808      Put_Huffman_Code (curr_descr.lit_len (deflate_code_for_lz_length (length)));
 809      --  Extra bits are needed to differentiate lengths sharing the same code.
 810      extra_bits := extra_bits_for_lz_length (length);
 811      if extra_bits > 0 then
 812        --  We keep only the last extra_bits bits of the length (minus given offset).
 813        --  Example: if extra_bits = 1, only the parity is sent (0 or 1);
 814        --  the rest has been already sent with Put_Huffman_code above.
 815        --  Equivalent: x:= x mod (2 ** extra_bits);
 816        Put_Bits (
 817          U32 (length - extra_bits_for_lz_length_offset (length))
 818            and
 819          (Shift_Left (U32'(1), extra_bits) - 1),
 820          extra_bits);
 821      end if;
 822      --                             Distance Codes
 823      --                             --------------
 824      --       Extra           Extra             Extra               Extra
 825      --  Code Bits Dist  Code Bits  Dist   Code Bits Distance  Code Bits Distance
 826      --  ---- ---- ----  ---- ---- ------  ---- ---- --------  ---- ---- --------
 827      --    0   0    1      8   3   17-24    16    7  257-384    24   11  4097-6144
 828      --    1   0    2      9   3   25-32    17    7  385-512    25   11  6145-8192
 829      --    2   0    3     10   4   33-48    18    8  513-768    26   12  8193-12288
 830      --    3   0    4     11   4   49-64    19    8  769-1024   27   12 12289-16384
 831      --    4   1   5,6    12   5   65-96    20    9 1025-1536   28   13 16385-24576
 832      --    5   1   7,8    13   5   97-128   21    9 1537-2048   29   13 24577-32768
 833      --    6   2   9-12   14   6  129-192   22   10 2049-3072
 834      --    7   2  13-16   15   6  193-256   23   10 3073-4096
 835      --
 836      --
 837      --  Example: the code # 10 means the LZ distance (# positions back in the circular
 838      --           message buffer for starting the copy) shall be 33, plus the value given
 839      --           by the 4 extra bits (between 0 and 15).
 840      --
 841      case distance is
 842        when 1 .. 4 =>          --  Codes 0..3, with no extra bit
 843          Put_Huffman_Code (curr_descr.dis (distance - 1));
 844        when 5 .. 8 =>          --  Codes 4..5, with 1 extra bit
 845          Put_Huffman_Code (curr_descr.dis (4 + (distance - 5) / 2));
 846          Put_Bits (U32 ((distance - 5) mod 2), 1);
 847        when 9 .. 16 =>         --  Codes 6..7, with 2 extra bits
 848          Put_Huffman_Code (curr_descr.dis (6 + (distance - 9) / 4));
 849          Put_Bits (U32 ((distance - 9) mod 4), 2);
 850        when 17 .. 32 =>        --  Codes 8..9, with 3 extra bits
 851          Put_Huffman_Code (curr_descr.dis (8 + (distance - 17) / 8));
 852          Put_Bits (U32 ((distance - 17) mod 8), 3);
 853        when 33 .. 64 =>        --  Codes 10..11, with 4 extra bits
 854          Put_Huffman_Code (curr_descr.dis (10 + (distance - 33) / 16));
 855          Put_Bits (U32 ((distance - 33) mod 16), 4);
 856        when 65 .. 128 =>       --  Codes 12..13, with 5 extra bits
 857          Put_Huffman_Code (curr_descr.dis (12 + (distance - 65) / 32));
 858          Put_Bits (U32 ((distance - 65) mod 32), 5);
 859        when 129 .. 256 =>      --  Codes 14..15, with 6 extra bits
 860          Put_Huffman_Code (curr_descr.dis (14 + (distance - 129) / 64));
 861          Put_Bits (U32 ((distance - 129) mod 64), 6);
 862        when 257 .. 512 =>      --  Codes 16..17, with 7 extra bits
 863          Put_Huffman_Code (curr_descr.dis (16 + (distance - 257) / 128));
 864          Put_Bits (U32 ((distance - 257) mod 128), 7);
 865        when 513 .. 1024 =>     --  Codes 18..19, with 8 extra bits
 866          Put_Huffman_Code (curr_descr.dis (18 + (distance - 513) / 256));
 867          Put_Bits (U32 ((distance - 513) mod 256), 8);
 868        when 1025 .. 2048 =>    --  Codes 20..21, with 9 extra bits
 869          Put_Huffman_Code (curr_descr.dis (20 + (distance - 1025) / 512));
 870          Put_Bits (U32 ((distance - 1025) mod 512), 9);
 871        when 2049 .. 4096 =>    --  Codes 22..23, with 10 extra bits
 872          Put_Huffman_Code (curr_descr.dis (22 + (distance - 2049) / 1024));
 873          Put_Bits (U32 ((distance - 2049) mod 1024), 10);
 874        when 4097 .. 8192 =>    --  Codes 24..25, with 11 extra bits
 875          Put_Huffman_Code (curr_descr.dis (24 + (distance - 4097) / 2048));
 876          Put_Bits (U32 ((distance - 4097) mod 2048), 11);
 877        when 8193 .. 16384 =>   --  Codes 26..27, with 12 extra bits
 878          Put_Huffman_Code (curr_descr.dis (26 + (distance - 8193) / 4096));
 879          Put_Bits (U32 ((distance - 8193) mod 4096), 12);
 880        when 16385 .. 32768 =>  --  Codes 28..29, with 13 extra bits
 881          Put_Huffman_Code (curr_descr.dis (28 + (distance - 16385) / 8192));
 882          Put_Bits (U32 ((distance - 16385) mod 8192), 13);
 883      end case;
 884    end Put_DL_code;
 885  
 886    function Deflate_code_for_LZ_distance (distance : Distance_range) return Natural is
 887    begin
 888      case distance is
 889        when 1 .. 4 => -- Codes 0..3, with no extra bit
 890          return distance - 1;
 891        when 5 .. 8 => -- Codes 4..5, with 1 extra bit
 892          return  4 + (distance - 5) / 2;
 893        when 9 .. 16 => -- Codes 6..7, with 2 extra bits
 894          return  6 + (distance - 9) / 4;
 895        when 17 .. 32 => -- Codes 8..9, with 3 extra bits
 896          return  8 + (distance - 17) / 8;
 897        when 33 .. 64 => -- Codes 10..11, with 4 extra bits
 898          return  10 + (distance - 33) / 16;
 899        when 65 .. 128 => -- Codes 12..13, with 5 extra bits
 900          return  12 + (distance - 65) / 32;
 901        when 129 .. 256 => -- Codes 14..15, with 6 extra bits
 902          return  14 + (distance - 129) / 64;
 903        when 257 .. 512 => -- Codes 16..17, with 7 extra bits
 904          return  16 + (distance - 257) / 128;
 905        when 513 .. 1024 => -- Codes 18..19, with 8 extra bits
 906          return  18 + (distance - 513) / 256;
 907        when 1025 .. 2048 => -- Codes 20..21, with 9 extra bits
 908          return  20 + (distance - 1025) / 512;
 909        when 2049 .. 4096 => -- Codes 22..23, with 10 extra bits
 910          return  22 + (distance - 2049) / 1024;
 911        when 4097 .. 8192 => -- Codes 24..25, with 11 extra bits
 912          return  24 + (distance - 4097) / 2048;
 913        when 8193 .. 16384 => -- Codes 26..27, with 12 extra bits
 914          return  26 + (distance - 8193) / 4096;
 915        when 16385 .. 32768 => -- Codes 28..29, with 13 extra bits
 916          return  28 + (distance - 16385) / 8192;
 917      end case;
 918    end Deflate_code_for_LZ_distance;
 919  
 920    -----------------
 921    --  LZ Buffer  --
 922    -----------------
 923  
 924    --  We buffer the LZ codes (plain, or distance/length) in order to
 925    --  analyse them and try to do smart things.
 926  
 927    max_expand : constant := 14;  --  *Tuned* Sometimes it is better to store data and expand short strings
 928    code_for_max_expand : constant := 266;
 929    subtype Expanded_data is Byte_Buffer (1 .. max_expand);
 930  
 931    type LZ_atom_kind is (plain_byte, distance_length);
 932    type LZ_atom is record
 933      kind          : LZ_atom_kind;
 934      plain         : Byte;
 935      lz_distance   : Natural;
 936      lz_length     : Natural;
 937      lz_expanded   : Expanded_data;
 938    end record;
 939  
 940    --  *Tuned*. Min: 2**14, = 16384 (min half buffer 8192)
 941    --  Optimal so far: 2**17
 942    LZ_buffer_size : constant := 2**17;
 943    type LZ_buffer_index_type is mod LZ_buffer_size;
 944    type LZ_buffer_type is array (LZ_buffer_index_type range <>) of LZ_atom;
 945  
 946    empty_lit_len_stat : constant Stats_lit_len_type := (End_Of_Block => 1, others => 0);
 947    --  End_Of_Block will have to happen once, but never appears in the LZ statistics...
 948    empty_dis_stat : constant Stats_dis_type := (others => 0);
 949  
 950    --
 951    --  Compute statistics for both Literal-length, and Distance alphabets, from a LZ buffer
 952    --
 953    procedure Get_statistics (
 954      lzb           :  in LZ_buffer_type;
 955      stats_lit_len : out Stats_lit_len_type;
 956      stats_dis     : out Stats_dis_type
 957    )
 958    is
 959      lit_len : Alphabet_lit_len;
 960      dis     : Alphabet_dis;
 961    begin
 962      stats_lit_len := empty_lit_len_stat;
 963      stats_dis     := empty_dis_stat;
 964      for i in lzb'Range loop
 965        case lzb (i).kind is
 966          when plain_byte =>
 967            lit_len := Alphabet_lit_len (lzb (i).plain);
 968            stats_lit_len (lit_len) := stats_lit_len (lit_len) + 1;     --  +1 for this literal
 969          when distance_length =>
 970            lit_len := deflate_code_for_lz_length (lzb (i).lz_length);
 971            stats_lit_len (lit_len) := stats_lit_len (lit_len) + 1;     --  +1 for this length code
 972            dis := Deflate_code_for_LZ_distance (lzb (i).lz_distance);
 973            stats_dis (dis) := stats_dis (dis) + 1;                     --  +1 for this distance code
 974        end case;
 975      end loop;
 976    end Get_statistics;
 977  
 978    --
 979    --  Send a LZ buffer using currently defined Huffman codes
 980    --
 981    procedure Put_LZ_buffer (lzb : LZ_buffer_type) is
 982    begin
 983      for i in lzb'Range loop
 984        case lzb (i).kind is
 985          when plain_byte =>
 986            Put_literal_byte (lzb (i).plain);
 987          when distance_length =>
 988            Put_DL_code (lzb (i).lz_distance, lzb (i).lz_length);
 989        end case;
 990      end loop;
 991    end Put_LZ_buffer;
 992  
 993    block_to_finish : Boolean := False;
 994    last_block_marked : Boolean := False;
 995    type Block_type is (stored, fixed, dynamic, reserved);  --  Appnote, 5.5.2
 996    --  If last_block_type = dynamic, we may recycle previous block's Huffman codes
 997    last_block_type : Block_type := reserved;
 998  
 999    procedure Mark_new_block (last_block_for_stream : Boolean) is
1000    begin
1001      if block_to_finish and last_block_type in fixed .. dynamic then
1002        Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));  --  Finish previous block
1003      end if;
1004      block_to_finish := True;
1005      Put_Bits (code => Boolean'Pos (last_block_for_stream), code_size => 1);
1006      last_block_marked := last_block_for_stream;
1007    end Mark_new_block;
1008  
1009    --  Send a LZ buffer completely decoded as literals (LZ compression is discarded)
1010    procedure Expand_LZ_buffer (lzb : LZ_buffer_type; last_block : Boolean) is
1011      b1, b2 : Byte;
1012      to_be_sent : Natural_M32 := 0;
1013      --  to_be_sent is not always equal to lzb'Length: sometimes you have a DL code
1014      mid : LZ_buffer_index_type;
1015    begin
1016      for i in lzb'Range loop
1017        case lzb (i).kind is
1018          when plain_byte =>
1019            to_be_sent := to_be_sent + 1;
1020          when distance_length =>
1021            to_be_sent := to_be_sent + Natural_M32 (lzb (i).lz_length);
1022        end case;
1023      end loop;
1024      if to_be_sent > 16#FFFF# then  --  Ow, cannot send all that in one chunk.
1025        --  Instead of a tedious block splitting, just divide and conquer:
1026        mid := LZ_buffer_index_type ((Natural_M32 (lzb'First) + Natural_M32 (lzb'Last)) / 2);
1027        if trace then
1028          Put_Line (log,
1029            "Expand_LZ_buffer: splitting large stored block: " &
1030            LZ_buffer_index_type'Image (lzb'First) &
1031            LZ_buffer_index_type'Image (mid) &
1032            LZ_buffer_index_type'Image (lzb'Last)
1033          );
1034        end if;
1035        Expand_LZ_buffer (lzb (lzb'First .. mid), last_block => False);
1036        Expand_LZ_buffer (lzb (mid + 1   .. lzb'Last), last_block => last_block);
1037        return;
1038      end if;
1039      if trace then
1040        Put_Line (log, "Expand_LZ_buffer: sending" & Natural_M32'Image (to_be_sent) & " 'plain' bytes");
1041      end if;
1042      b1 := Byte (to_be_sent mod 256);
1043      b2 := Byte (to_be_sent  /  256);
1044      Mark_new_block (last_block_for_stream => last_block);
1045      last_block_type := stored;
1046      Put_Bits (code => 0, code_size => 2);  --  Signals a "stored" block
1047      Flush_bit_buffer;  --  Go to byte boundary
1048      Put_byte (b1);
1049      Put_byte (b2);
1050      Put_byte (not b1);
1051      Put_byte (not b2);
1052      for i in lzb'Range loop
1053        case lzb (i).kind is
1054          when plain_byte =>
1055            Put_byte (lzb (i).plain);
1056          when distance_length =>
1057            for j in 1 .. lzb (i).lz_length loop
1058              Put_byte (lzb (i).lz_expanded (j));
1059            end loop;
1060        end case;
1061      end loop;
1062    end Expand_LZ_buffer;
1063  
1064    --  Extra bits that need to be sent after various Deflate codes
1065  
1066    extra_bits_for_lz_length_code : constant array (257 .. 285) of Natural :=
1067       (257 .. 264 => 0,
1068        265 .. 268 => 1,
1069        269 .. 272 => 2,
1070        273 .. 276 => 3,
1071        277 .. 280 => 4,
1072        281 .. 284 => 5,
1073        285        => 0
1074      );
1075  
1076    extra_bits_for_lz_distance_code : constant array (0 .. 29) of Natural :=
1077       (0 ..  3 =>  0,
1078        4 ..  5 =>  1,
1079        6 ..  7 =>  2,
1080        8 ..  9 =>  3,
1081       10 .. 11 =>  4,
1082       12 .. 13 =>  5,
1083       14 .. 15 =>  6,
1084       16 .. 17 =>  7,
1085       18 .. 19 =>  8,
1086       20 .. 21 =>  9,
1087       22 .. 23 => 10,
1088       24 .. 25 => 11,
1089       26 .. 27 => 12,
1090       28 .. 29 => 13
1091      );
1092  
1093    subtype Long_length_codes is
1094      Alphabet_lit_len range code_for_max_expand + 1 .. Alphabet_lit_len'Last;
1095    zero_bl_long_lengths : constant Stats_type (Long_length_codes) := (others => 0);
1096  
1097    --  Send_as_block.
1098    --
1099    --  lzb (can be a slice of the principal buffer) will be sent as:
1100    --        * a new "dynamic" block, preceded by a compression structure header
1101    --    or  * the continuation of previous "dynamic" block
1102    --    or  * a new "fixed" block, if lz data's Huffman descriptor is close enough to "fixed"
1103    --    or  * a new "stored" block, if lz data are too random
1104  
1105    procedure Send_as_block (lzb : LZ_buffer_type; last_block : Boolean) is
1106      new_descr, new_descr_2 : Deflate_Huff_Descriptors;
1107      --
1108      procedure Send_fixed_block is
1109      begin
1110        if last_block_type = fixed then
1111          --  Cool, we don't need to mark a block boundary: the Huffman codes are already
1112          --  the expected ones. We can just continue sending the LZ atoms.
1113          null;
1114        else
1115          Mark_new_block (last_block_for_stream => last_block);
1116          curr_descr := Deflate_fixed_descriptors;
1117          Put_Bits (code => 1, code_size => 2);  --  Signals a "fixed" block
1118          last_block_type := fixed;
1119        end if;
1120        Put_LZ_buffer (lzb);
1121      end Send_fixed_block;
1122      --
1123      stats_lit_len, stats_lit_len_2 : Stats_lit_len_type;
1124      stats_dis, stats_dis_2 : Stats_dis_type;
1125      --
1126      procedure Send_dynamic_block (dyn : Deflate_Huff_Descriptors) is
1127        dummy : Count_type := 0;
1128      begin
1129        Mark_new_block (last_block_for_stream => last_block);
1130        curr_descr := Prepare_Huffman_Codes (dyn);
1131        Put_Bits (code => 2, code_size => 2);  --  Signals a "dynamic" block
1132        Put_Compression_Structure (curr_descr, cost_analysis => False, bits => dummy);
1133        Put_LZ_buffer (lzb);
1134        last_block_type := dynamic;
1135      end Send_dynamic_block;
1136      --  The following variables will contain the *exact* number of bits taken
1137      --  by the block to be sent, using different Huffman encodings, or stored.
1138      stored_format_bits,                      --  Block is stored (no compression)
1139      fixed_format_bits,                       --  Fixed (preset) Huffman codes
1140      dynamic_format_bits,                     --  Dynamic Huffman codes using block's statistics
1141      dynamic_format_bits_2,                   --  Dynamic Huffman codes after Tweak_for_better_RLE
1142      recycled_format_bits : Count_type := 0;  --  Continue previous block, use current Huffman codes
1143      --
1144      stored_format_possible : Boolean;  --  Can we store (needs expansion of DL codes) ?
1145      recycling_possible     : Boolean;  --  Can we recycle current Huffman codes ?
1146      --
1147      procedure Compute_sizes_of_variants is
1148        c     : Count_type;
1149        extra : Natural;
1150      begin
1151        --  We count bits taken by literals, for each block format variant.
1152        for i in 0 .. 255 loop
1153          c := stats_lit_len (i);  --  This literal appears c times in the LZ buffer
1154          stored_format_bits    := stored_format_bits    + 8                                               * c;
1155          fixed_format_bits     := fixed_format_bits     + Count_type (default_lit_len_bl (i))             * c;
1156          dynamic_format_bits   := dynamic_format_bits   + Count_type (new_descr.lit_len (i).bit_length)   * c;
1157          dynamic_format_bits_2 := dynamic_format_bits_2 + Count_type (new_descr_2.lit_len (i).bit_length) * c;
1158          recycled_format_bits  := recycled_format_bits  + Count_type (curr_descr.lit_len (i).bit_length)  * c;
1159        end loop;
1160        --  We count bits taken by DL codes.
1161        if stored_format_possible then
1162          for i in lzb'Range loop
1163            case lzb (i).kind is
1164              when plain_byte =>
1165                null;  --  Already counted
1166              when distance_length =>
1167                 --  In the stored format, DL codes are expanded
1168                stored_format_bits := stored_format_bits + 8 * Count_type (lzb (i).lz_length);
1169            end case;
1170          end loop;
1171        end if;
1172        --  For compressed formats, count Huffman bits and extra bits.
1173        --  Lengths codes:
1174        for i in 257 .. 285 loop
1175          c := stats_lit_len (i);  --  This length code appears c times in the LZ buffer
1176          extra := extra_bits_for_lz_length_code (i);
1177          fixed_format_bits     := fixed_format_bits     + Count_type (default_lit_len_bl (i) + extra)             * c;
1178          dynamic_format_bits   := dynamic_format_bits   + Count_type (new_descr.lit_len (i).bit_length + extra)   * c;
1179          dynamic_format_bits_2 := dynamic_format_bits_2 + Count_type (new_descr_2.lit_len (i).bit_length + extra) * c;
1180          recycled_format_bits  := recycled_format_bits  + Count_type (curr_descr.lit_len (i).bit_length + extra)  * c;
1181        end loop;
1182        --  Distance codes:
1183        for i in 0 .. 29 loop
1184          c := stats_dis (i);  --  This distance code appears c times in the LZ buffer
1185          extra := extra_bits_for_lz_distance_code (i);
1186          fixed_format_bits     := fixed_format_bits     + Count_type (default_dis_bl (i) + extra)             * c;
1187          dynamic_format_bits   := dynamic_format_bits   + Count_type (new_descr.dis (i).bit_length + extra)   * c;
1188          dynamic_format_bits_2 := dynamic_format_bits_2 + Count_type (new_descr_2.dis (i).bit_length + extra) * c;
1189          recycled_format_bits  := recycled_format_bits  + Count_type (curr_descr.dis (i).bit_length + extra)  * c;
1190        end loop;
1191        --  Supplemental bits to be counted
1192        --
1193        stored_format_bits := stored_format_bits +
1194          (1 + (stored_format_bits / 8) / 65_535)  --  Number of stored blocks needed
1195          * 5  -- 5 bytes per header
1196          * 8; -- ... converted into bits
1197        --
1198        c := 1;  --  Is-last-block flag
1199        if block_to_finish and last_block_type in fixed .. dynamic then
1200          c := c + Count_type (curr_descr.lit_len (End_Of_Block).bit_length);
1201        end if;
1202        stored_format_bits    := stored_format_bits + c;
1203        fixed_format_bits     := fixed_format_bits + c + 2;
1204        dynamic_format_bits   := dynamic_format_bits + c + 2;
1205        dynamic_format_bits_2 := dynamic_format_bits_2 + c + 2;
1206        --  For both dynamic formats, we also counts the bits taken by the compression header!
1207        Put_Compression_Structure (new_descr,   cost_analysis => True, bits => dynamic_format_bits);
1208        Put_Compression_Structure (new_descr_2, cost_analysis => True, bits => dynamic_format_bits_2);
1209      end Compute_sizes_of_variants;
1210      --
1211      optimal_format_bits : Count_type;
1212    begin
1213      Get_statistics (lzb, stats_lit_len, stats_dis);
1214      new_descr := Build_descriptors (stats_lit_len, stats_dis);
1215      stats_lit_len_2 := stats_lit_len;
1216      stats_dis_2 := stats_dis;
1217      Tweak_for_better_RLE (stats_lit_len_2);
1218      Tweak_for_better_RLE (stats_dis_2);
1219      new_descr_2 := Build_descriptors (stats_lit_len_2, stats_dis_2);
1220      --  For "stored" block format, prevent expansion of DL codes with length > max_expand.
1221      --  We check stats are all 0 for long length codes:
1222      stored_format_possible := stats_lit_len (Long_length_codes) = zero_bl_long_lengths;
1223      recycling_possible :=
1224        last_block_type = fixed  --  The "fixed" alphabets use all symbols, then always recyclable.
1225          or else
1226        (last_block_type = dynamic and then Recyclable (curr_descr, new_descr));
1227      Compute_sizes_of_variants;
1228      if not stored_format_possible then
1229        stored_format_bits := Count_type'Last;
1230      end if;
1231      if not recycling_possible then
1232        recycled_format_bits := Count_type'Last;
1233      end if;
1234      optimal_format_bits := Count_type'Min (
1235        Count_type'Min (stored_format_bits, fixed_format_bits),
1236        Count_type'Min (
1237          Count_type'Min (dynamic_format_bits, dynamic_format_bits_2),
1238          recycled_format_bits)
1239      );
1240      --
1241      --  Selection of the block format with smallest size.
1242      --
1243      if fixed_format_bits = optimal_format_bits then
1244        if trace then
1245          Put_Line (log, "### New ""fixed"" block");
1246        end if;
1247        Send_fixed_block;
1248      elsif dynamic_format_bits = optimal_format_bits then
1249        if trace then
1250          Put_Line (log, "### New ""dynamic"" block with compression structure header");
1251        end if;
1252        Send_dynamic_block (new_descr);
1253      elsif dynamic_format_bits_2 = optimal_format_bits then
1254        if trace then
1255          Put_Line (log, "### New ""dynamic"" block, RLE-tweaked, with compression structure header");
1256        end if;
1257        Send_dynamic_block (new_descr_2);
1258      elsif recycled_format_bits = optimal_format_bits then
1259        if trace then
1260          Put_Line (log, "### Recycle: continue using existing Huffman compression structures");
1261        end if;
1262        Put_LZ_buffer (lzb);
1263      else  --  We have stored_format_bits = optimal_format_bits
1264        if trace then
1265          Put_Line (log, "### Too random - use ""stored"" block");
1266        end if;
1267        Expand_LZ_buffer (lzb, last_block);
1268      end if;
1269    end Send_as_block;
1270  
1271    subtype Full_range_LZ_buffer_type is LZ_buffer_type (LZ_buffer_index_type);
1272    type p_Full_range_LZ_buffer_type is access Full_range_LZ_buffer_type;
1273    procedure Dispose is
1274      new Ada.Unchecked_Deallocation (Full_range_LZ_buffer_type, p_Full_range_LZ_buffer_type);
1275  
1276    --  This is the main, big, fat, circular buffer containing LZ codes,
1277    --  each LZ code being a literal or a DL code.
1278    --  Heap allocation is needed only because default stack is too small on some targets.
1279    lz_buffer : p_Full_range_LZ_buffer_type := null;
1280    lz_buffer_index : LZ_buffer_index_type := 0;
1281    past_lz_data : Boolean := False;
1282    --  When True: some LZ_buffer_size data before lz_buffer_index (modulo!) are real, past data
1283  
1284    ---------------------------------------------------------------------------------
1285    --  Scanning and sampling: the really sexy part of the Taillaule algorithm...  --
1286    ---------------------------------------------------------------------------------
1287  
1288    --  We examine similarities in the LZ data flow at different step sizes.
1289    --  If the optimal Huffman encoding for this portion is very different, we choose to
1290    --  cut current block and start a new one. The shorter the step, the higher the threshold
1291    --  for starting a dynamic block, since the compression header is taking some room each time.
1292  
1293    --  *Tuned* (a bit...)
1294    min_step : constant := 750;
1295  
1296    type Step_threshold_metric is record
1297      slider_step       : LZ_buffer_index_type;  --  Should be a multiple of min_step.
1298      cutting_threshold : Positive;
1299      metric            : Distance_type;
1300    end record;
1301  
1302    --  *Tuned* thresholds
1303    --  NB: the enwik8, then silesia, then others tests are tough for lowering any!
1304    step_choice : constant array (Positive range <>) of Step_threshold_metric :=
1305       ((8 * min_step,  420, L1_tweaked),  --  Deflate_1, Deflate_2, Deflate_3 (enwik8)
1306        (4 * min_step,  430, L1_tweaked),  --             Deflate_2, Deflate_3 (silesia)
1307            (min_step, 2050, L1_tweaked)   --                        Deflate_3 (DB test)
1308      );
1309  
1310    max_choice : constant array (Taillaule_Deflation_Method) of Positive :=
1311      (Deflate_1 => 1, Deflate_2 => 2, others => step_choice'Last);
1312  
1313    slider_size : constant := 4096;
1314    half_slider_size : constant := slider_size / 2;
1315    slider_max : constant := slider_size - 1;
1316  
1317    --  Phases (A) and (B) are done in a single function: we get Huffman
1318    --  descriptors that should be good for encoding a given sequence of LZ atoms.
1319    function Build_descriptors (lzb : LZ_buffer_type) return Deflate_Huff_Descriptors is
1320      stats_lit_len : Stats_lit_len_type;
1321      stats_dis     : Stats_dis_type;
1322    begin
1323      Get_statistics (lzb, stats_lit_len, stats_dis);
1324      return Build_descriptors (stats_lit_len, stats_dis);
1325    end Build_descriptors;
1326  
1327    procedure Scan_and_send_from_main_buffer (from, to : LZ_buffer_index_type; last_flush : Boolean) is
1328      --  The following descriptors are *not* used for compressing, but for detecting similarities.
1329      initial_hd, sliding_hd : Deflate_Huff_Descriptors;
1330      start, slide_mid, send_from : LZ_buffer_index_type;
1331      sliding_hd_computed : Boolean;
1332    begin
1333      if to - from < slider_max then
1334        Send_as_block (lz_buffer (from .. to), last_flush);
1335        return;
1336      end if;
1337      --  For further comments: n := LZ_buffer_size
1338      if past_lz_data then  --  We have n / 2 previous data before 'from'.
1339        start := from - LZ_buffer_index_type (half_slider_size);
1340      else
1341        start := from;  --  Cannot have past data
1342      end if;
1343      if start > from then  --  Looped over, (mod n). Slider data are in two chunks in main buffer
1344        --  put_line(from'img & to'img & start'img);
1345        declare
1346          copy_from : LZ_buffer_index_type := start;
1347          copy : LZ_buffer_type (0 .. slider_max);
1348        begin
1349          for i in copy'Range loop
1350            copy (i) := lz_buffer (copy_from);
1351            copy_from := copy_from + 1;  --  Loops over (mod n)
1352          end loop;
1353          initial_hd := Build_descriptors (copy);
1354        end;
1355        --  Concatenation instead of above loop bombs with a Range Check error:
1356        --  lz_buffer(start .. lz_buffer'Last) &
1357        --  lz_buffer(0 .. start + LZ_buffer_index_type(slider_max))
1358      else
1359        initial_hd := Build_descriptors (lz_buffer (start .. start + slider_max));
1360      end if;
1361      send_from := from;
1362      slide_mid := from + min_step;
1363      Scan_LZ_data :
1364      while Integer_M32 (slide_mid) + half_slider_size < Integer_M32 (to) loop
1365        exit Scan_LZ_data when deactivate_scanning;
1366        sliding_hd_computed := False;
1367        Browse_step_level :
1368        for level in step_choice'Range loop
1369          exit Browse_step_level when level > max_choice (method);
1370          if (slide_mid - from) mod step_choice (level).slider_step = 0 then
1371            if not sliding_hd_computed then
1372              sliding_hd := Build_descriptors (lz_buffer (slide_mid - half_slider_size .. slide_mid + half_slider_size));
1373              sliding_hd_computed := True;
1374            end if;
1375            if not Similar (
1376              initial_hd,
1377              sliding_hd,
1378              step_choice (level).metric,
1379              step_choice (level).cutting_threshold,
1380              "Compare sliding to initial (step size=" &
1381              LZ_buffer_index_type'Image (step_choice (level).slider_step) & ')'
1382            )
1383            then
1384              if trace then
1385                Put_Line (log,
1386                  "### Cutting @ " & LZ_buffer_index_type'Image (slide_mid) &
1387                  "  ('from' is" & LZ_buffer_index_type'Image (from) &
1388                  ", 'to' is" & LZ_buffer_index_type'Image (to) & ')'
1389                );
1390              end if;
1391              Send_as_block (lz_buffer (send_from .. slide_mid - 1), last_block => False);
1392              send_from := slide_mid;
1393              initial_hd := sliding_hd;  --  Reset reference descriptor for further comparisons
1394              exit Browse_step_level;  --  Cutting once at a given place is enough :-)
1395            end if;
1396          end if;
1397        end loop Browse_step_level;
1398        --  Exit before an eventual increment of slide_mid that would loop over (mod n).
1399        exit Scan_LZ_data when Integer_M32 (slide_mid) + min_step + half_slider_size >= Integer_M32 (to);
1400        slide_mid := slide_mid + min_step;
1401      end loop Scan_LZ_data;
1402      --
1403      --  Send last block for slice from .. to.
1404      --
1405      if send_from <= to then
1406        Send_as_block (lz_buffer (send_from .. to), last_block => last_flush);
1407      end if;
1408    end Scan_and_send_from_main_buffer;
1409  
1410    procedure Flush_half_buffer (last_flush : Boolean) is
1411      last_idx : constant LZ_buffer_index_type := lz_buffer_index - 1;
1412      n_div_2 : constant := LZ_buffer_size / 2;
1413    begin
1414      if last_idx < n_div_2 then
1415        Scan_and_send_from_main_buffer (0, last_idx, last_flush);        -- 1st half
1416      else
1417        Scan_and_send_from_main_buffer (n_div_2, last_idx, last_flush);  -- 2nd half
1418      end if;
1419      --  From this point, all further calls to Flush_half_buffer will
1420      --  have n_div_2 elements of past data.
1421      past_lz_data := True;
1422    end Flush_half_buffer;
1423  
1424    procedure Push (a : LZ_atom) is
1425    pragma Inline (Push);
1426    begin
1427      lz_buffer (lz_buffer_index) := a;
1428      lz_buffer_index := lz_buffer_index + 1;  --  becomes 0 when reaching LZ_buffer_size (modular)
1429      if lz_buffer_index * 2 = 0 then
1430        Flush_half_buffer (last_flush => False);
1431      end if;
1432    end Push;
1433  
1434    procedure Put_or_delay_literal_byte (b : Byte) is
1435    pragma Inline (Put_or_delay_literal_byte);
1436    begin
1437      case method is
1438        when Deflate_Fixed =>
1439          Put_literal_byte (b);  --  Buffering is not needed in this mode
1440        when Taillaule_Deflation_Method =>
1441          Push ((plain_byte, b, 0, 0, (b, others => 0)));
1442      end case;
1443    end Put_or_delay_literal_byte;
1444  
1445    procedure Put_or_delay_DL_code (distance, length : Integer; expand : Expanded_data) is
1446    pragma Inline (Put_or_delay_DL_code);
1447    begin
1448      case method is
1449        when Deflate_Fixed =>
1450          Put_DL_code (distance, length);  --  Buffering is not needed in this mode
1451        when Taillaule_Deflation_Method =>
1452          Push ((distance_length, 0, distance, length, expand));
1453      end case;
1454    end Put_or_delay_DL_code;
1455  
1456    --------------------------------
1457    -- LZ77 front-end compression --
1458    --------------------------------
1459  
1460    procedure Encode is
1461  
1462      feedback_milestone,
1463      Bytes_in   : Zip_Streams.ZS_Size_Type := 0;   --  Count of input file bytes processed
1464      user_aborting : Boolean;
1465      PctDone : Natural;
1466  
1467      function Read_byte return Byte is
1468        b : Byte;
1469        use Zip_Streams;
1470      begin
1471        b := IO_buffers.InBuf (IO_buffers.InBufIdx);
1472        IO_buffers.InBufIdx := IO_buffers.InBufIdx + 1;
1473        Zip.CRC_Crypto.Update (CRC, (1 => b));
1474        Bytes_in := Bytes_in + 1;
1475        if feedback /= null then
1476          if Bytes_in = 1 then
1477            feedback (0, False, user_aborting);
1478          end if;
1479          if feedback_milestone > 0 and then
1480             ((Bytes_in - 1) mod feedback_milestone = 0
1481              or Bytes_in = ZS_Size_Type (input_size))
1482          then
1483            if input_size_known then
1484              PctDone := Integer ((100.0 * Float (Bytes_in)) / Float (input_size));
1485              feedback (PctDone, False, user_aborting);
1486            else
1487              feedback (0, False, user_aborting);
1488            end if;
1489            if user_aborting then
1490              raise User_abort;
1491            end if;
1492          end if;
1493        end if;
1494        return b;
1495      end Read_byte;
1496  
1497      function More_bytes return Boolean is
1498      begin
1499        if IO_buffers.InBufIdx > IO_buffers.MaxInBufIdx then
1500          Read_Block (IO_buffers, input);
1501        end if;
1502        return not IO_buffers.InputEoF;
1503      end More_bytes;
1504  
1505      --  LZ77 parameters
1506      Look_Ahead_LZ77    : constant Integer := 258;
1507      String_buffer_size : constant := 2**15;  --  Required: 2**15 for Deflate, 2**16 for Deflate_e
1508      type Text_buffer_index is mod String_buffer_size;
1509      type Text_buffer is array (Text_buffer_index) of Byte;
1510      Text_Buf : Text_buffer;
1511      R : Text_buffer_index;
1512  
1513      --  If the DLE coding doesn't fit the format constraints, we need
1514      --  to decode it as a simple sequence of literals. The buffer used is
1515      --  called "Text" buffer by reference to "clear-text", but actually it
1516      --  is any binary data.
1517  
1518      procedure LZ77_emits_DL_code (distance, length : Integer) is
1519        --  NB: no worry, all arithmetics in Text_buffer_index are modulo String_buffer_size.
1520        b : Byte;
1521        copy_start : Text_buffer_index;
1522        expand : Expanded_data;
1523        ie : Positive := 1;
1524      begin
1525        if distance = String_buffer_size then  --  Happens with 7-Zip, cannot happen with Info-Zip.
1526          copy_start := R;
1527        else
1528          copy_start := R - Text_buffer_index (distance);
1529        end if;
1530        --  Expand into the circular text buffer to have it up to date
1531        for K in 0 .. Text_buffer_index (length - 1) loop
1532          b := Text_Buf (copy_start + K);
1533          Text_Buf (R) := b;
1534          R := R + 1;
1535          if ie <= max_expand then  --  Also memorize short sequences for LZ buffer
1536            expand (ie) := b;       --  for the case a block needs to be stored in clear.
1537            ie := ie + 1;
1538          end if;
1539        end loop;
1540        if distance in Distance_range and length in Length_range then
1541          Put_or_delay_DL_code (distance, length, expand);
1542        else
1543          if trace then
1544            Put_Line (log,
1545              "<> Too bad, cannot encode this distance-length pair, " &
1546              "then we have to expand to output (dist = " & Integer'Image (distance) &
1547              " len=" & Integer'Image (length) & ")"
1548            );
1549          end if;
1550          for K in 0 .. Text_buffer_index (length - 1) loop
1551            Put_or_delay_literal_byte (Text_Buf (copy_start + K));
1552          end loop;
1553        end if;
1554      end LZ77_emits_DL_code;
1555  
1556      procedure LZ77_emits_literal_byte (b : Byte) is
1557      begin
1558        Text_Buf (R) := b;
1559        R := R + 1;
1560        Put_or_delay_literal_byte (b);
1561      end LZ77_emits_literal_byte;
1562  
1563      procedure Dummy_Estimate_DL_Codes (
1564        matches          : in out LZ77.Matches_Array;
1565        old_match_index  : in     Natural;
1566        prefixes         : in     LZ77.Byte_Array;
1567        best_score_index :    out Positive;
1568        best_score_set   :    out LZ77.Prefetch_Index_Type;
1569        match_trace      :    out LZ77.DLP_Array
1570      )
1571      is null;
1572  
1573      LZ77_choice : constant array (Deflation_Method) of LZ77.Method_Type :=
1574        (Deflate_Fixed  => LZ77.IZ_4,
1575         Deflate_0      => LZ77.No_LZ77,
1576         Deflate_1      => LZ77.IZ_6,     --  level 6 is the default in Info-Zip's zip.exe
1577         Deflate_2      => LZ77.IZ_8,
1578         Deflate_3      => LZ77.IZ_10,
1579         Deflate_R      => LZ77.Rich);
1580  
1581      procedure My_LZ77 is
1582        new LZ77.Encode
1583           (String_buffer_size => String_buffer_size,
1584            Look_Ahead         => Look_Ahead_LZ77,
1585            Threshold          => 2,  --  From a string match length > 2, a DL code is sent
1586            Method             => LZ77_choice (method),
1587            Read_Byte          => Read_byte,
1588            More_Bytes         => More_bytes,
1589            Write_Literal      => LZ77_emits_literal_byte,
1590            Write_DL_Code      => LZ77_emits_DL_code,
1591            Estimate_DL_Codes  => Dummy_Estimate_DL_Codes
1592          );
1593  
1594    begin  --  Encode
1595      Read_Block (IO_buffers, input);
1596      R := Text_buffer_index (String_buffer_size - Look_Ahead_LZ77);
1597      if input_size_known then
1598        feedback_milestone := Zip_Streams.ZS_Size_Type (input_size / feedback_steps);
1599      end if;
1600      case method is
1601        when Deflate_Fixed =>  --  "Fixed" (predefined) compression structure
1602          --  We have only one compressed data block, then it is already the last one.
1603          Put_Bits (code => 1, code_size => 1);  --  Signals last block
1604          Put_Bits (code => 1, code_size => 2);  --  Signals a "fixed" block
1605        when Taillaule_Deflation_Method =>
1606          null;  --  No start data sent, all is delayed
1607      end case;
1608      ----------------------------------------------------------------
1609      --  The whole compression is happening in the following line: --
1610      ----------------------------------------------------------------
1611      My_LZ77;
1612      --  Done. Send the code signaling the end of compressed data block:
1613      case method is
1614        when Deflate_Fixed =>
1615          Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1616        when Taillaule_Deflation_Method =>
1617          if lz_buffer_index * 2 = 0 then  --  Already flushed at latest Push, or empty data
1618            if block_to_finish and then last_block_type in fixed .. dynamic then
1619              Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1620            end if;
1621          else
1622            Flush_half_buffer (last_flush => True);
1623            if last_block_type in fixed .. dynamic then
1624              Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1625            end if;
1626          end if;
1627          if not last_block_marked then
1628            --  Add a fake fixed block, just to have a final block...
1629            Put_Bits (code => 1, code_size => 1);  --  Signals last block
1630            Put_Bits (code => 1, code_size => 2);  --  Signals a "fixed" block
1631            curr_descr := Deflate_fixed_descriptors;
1632            Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1633          end if;
1634      end case;
1635    end Encode;
1636  
1637    procedure Deallocation is
1638    begin
1639      Dispose (lz_buffer);
1640      Deallocate_Buffers (IO_buffers);
1641    end Deallocation;
1642  
1643  begin
1644    if trace then
1645      begin
1646        Open (log, Append_File, log_name);
1647      exception
1648        when Name_Error =>
1649          Create (log, Out_File, log_name);
1650      end;
1651      Put (log, "New stream" & sep & sep & sep & sep & sep & sep & sep & sep);
1652      if input_size_known then
1653        Put (log, sep & Zip_64_Data_Size_Type'Image (input_size) &
1654                 sep & sep & sep & sep & sep & sep & "bytes input");
1655      end if;
1656      New_Line (log);
1657    end if;
1658    Allocate_Buffers (IO_buffers, input_size_known, input_size);
1659    output_size := 0;
1660    lz_buffer := new Full_range_LZ_buffer_type;
1661    begin
1662      Encode;
1663      compression_ok := True;
1664      Flush_bit_buffer;
1665      Flush_byte_buffer;
1666    exception
1667      when Compression_inefficient =>  --  Escaped from Encode
1668        compression_ok := False;
1669    end;
1670    if trace then
1671      Close (log);
1672    end if;
1673    Deallocation;
1674  exception
1675    when others =>
1676      Deallocation;
1677      raise;
1678  end Zip.Compress.Deflate;

Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip). Ada programming.
Some news about Zip-Ada and other Ada projects on Gautier's blog.