Back to... Zip-Ada

Source file : lz77.adb



   1  --  There are four LZ77 encoders at choice in this package:
   2  --
   3  --    1/  LZ77_using_LZHuf, based on LZHuf by Haruhiko Okumura and Haruyasu Yoshizaki.
   4  --
   5  --    2/  LZ77_using_IZ, based on Info-Zip's Zip's deflate.c by Jean-Loup Gailly.
   6  --          deflate.c is actually the LZ77 part of Info-Zip's compression.
   7  --
   8  --    3/  LZ77_using_BT4, based on LZMA SDK's BT4 algorithm by Igor Pavlov.
   9  --
  10  --    4/  LZ77_by_Rich, based on PROG2.C by Rich Geldreich, Jr.
  11  --
  12  --  Variant 1/, LZ77_using_LZHuf, is working since 2009. Two problems: it is slow
  13  --     and not well adapted to the Deflate format (mediocre compression).
  14  --
  15  --  Variant 2/, LZ77_using_IZ, is much faster, and better suited for Deflate.
  16  --     It has been added on 05-Mar-2016.
  17  --     The code is tailored and optimized for a single set of
  18  --     the String_buffer_size, Look_Ahead, Threshold LZ77 parameters - those for Deflate.
  19  --
  20  --  Variant 3/, LZ77_using_BT4, was added on 06-Sep-2016.
  21  --     The seems to be the best match finder for LZMA on data of the >= 1 MiB scale.
  22  
  23  --  To do:
  24  --
  25  --  2/
  26  --    - LZ77 / IZ: similar to the test with TOO_FAR, try to cluster distances around
  27  --        values needing less extra bits (may not work at all...)
  28  --    - LZ77 / IZ: tune TOO_FAR (max: 32767), see http://optipng.sf.net/pngtech/too_far.html
  29  --        "TOO_FAR in zlib Is Not Too Far" for discussion
  30  
  31  --  Legal licensing note:
  32  
  33  --  Copyright (c) 2016 .. 2020 Gautier de Montmollin (maintainer of the Ada version)
  34  --  SWITZERLAND
  35  
  36  --  Permission is hereby granted, free of charge, to any person obtaining a copy
  37  --  of this software and associated documentation files (the "Software"), to deal
  38  --  in the Software without restriction, including without limitation the rights
  39  --  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  40  --  copies of the Software, and to permit persons to whom the Software is
  41  --  furnished to do so, subject to the following conditions:
  42  
  43  --  The above copyright notice and this permission notice shall be included in
  44  --  all copies or substantial portions of the Software.
  45  
  46  --  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  47  --  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  48  --  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  49  --  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  50  --  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  51  --  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  52  --  THE SOFTWARE.
  53  
  54  --  NB: this is the MIT License, as found 21-Aug-2016 on the site
  55  --  http://www.opensource.org/licenses/mit-license.php
  56  
  57  with Ada.Text_IO, Ada.Integer_Text_IO;
  58  with Ada.Unchecked_Deallocation;
  59  with Interfaces; use Interfaces;
  60  with System;
  61  
  62  package body LZ77 is
  63  
  64    --  System.Word_Size: 13.3(8): A word is the largest amount of storage
  65    --  that can be conveniently and efficiently manipulated by the hardware,
  66    --  given the implementation's run-time model.
  67    --
  68    min_bits_32 : constant := Integer'Max (32, System.Word_Size);
  69    min_bits_16 : constant := Integer'Max (16, System.Word_Size);
  70  
  71    --  We define an Integer type which is at least 32 bits, but n bits
  72    --  on a native n (> 32) bits architecture (no performance hit on 64+
  73    --  bits architectures).
  74    --  Integer_M16 not needed: Integer already guarantees 16 bits
  75    --
  76    type Integer_M32 is range -2**(min_bits_32 - 1) .. 2**(min_bits_32 - 1) - 1;
  77    subtype Natural_M32  is Integer_M32 range 0 .. Integer_M32'Last;
  78    --  subtype Positive_M32 is Integer_M32 range 1 .. Integer_M32'Last;
  79  
  80    type Unsigned_M16 is mod 2**min_bits_16;
  81    type Unsigned_M32 is mod 2**min_bits_32;
  82  
  83    function Are_Matches_Sorted (m : Matches_Type) return Boolean is
  84    begin
  85      for i in 2 .. m.count loop
  86        if m.dl (i).length < m.dl (i - 1).length then
  87          return False;
  88        end if;
  89      end loop;
  90      return True;
  91    end Are_Matches_Sorted;
  92  
  93    procedure Encode is
  94  
  95      -----------------------
  96      --  LZHuf algorithm  --
  97      -----------------------
  98  
  99      procedure LZ77_using_LZHuf is
 100        --  Based on LZHUF by OKUMURA & YOSHIZAKI.
 101        --  Here the adaptive Huffman coding is thrown away:
 102        --  algorithm is used only to find matching patterns.
 103  
 104        N_Char    : constant Integer := 256 - Threshold + Look_Ahead;
 105        --  Character code (= 0..N_CHAR-1)
 106        Max_Table     : constant Integer := N_Char * 2 - 1;
 107  
 108        type Text_Buffer is array (0 .. String_buffer_size + Look_Ahead - 1) of Byte;
 109        empty_buffer : constant Text_Buffer := (others => 32);  --  ' '
 110  
 111        --  > The Huffman frequency handling is made generic so we have
 112        --    one copy of the tree and of the frequency table for Encode
 113        --    and one for Decode
 114  
 115        generic
 116        package Huffman is
 117          --- Pointing parent nodes.
 118          --- Area [Max_Table..(Max_Table + N_CHAR - 1)] are pointers for leaves
 119          Parent :  array (0 .. Max_Table + N_Char - 1) of Natural;
 120          --- Pointing children nodes (son[], son[] + 1)
 121          Son    :  array (0 .. Max_Table - 1)  of Natural;
 122  
 123          Root_Position : constant Integer := Max_Table - 1;  --  (can be always Son'last ?)
 124  
 125          procedure Start;
 126          procedure Update_Freq_Tree (C0 : Natural);
 127        end Huffman;
 128  
 129        package body Huffman is
 130  
 131          Freq : array (0 .. Max_Table) of Natural;  --  Cumulative freq table
 132  
 133          Max_Freq : constant := 16#8000#;
 134          --  ^-- update when cumulative frequency reaches to this value
 135  
 136          procedure Start is
 137            I : Natural;
 138          begin
 139            for J in  0 .. N_Char - 1  loop
 140              Freq (J) := 1;
 141              Son (J) := J + Max_Table;
 142              Parent (J + Max_Table) := J;
 143            end loop;
 144  
 145            I := 0;
 146            for J in N_Char .. Root_Position  loop
 147              Freq (J) := Freq (I) + Freq (I + 1);
 148              Son (J) := I;
 149              Parent (I) := J;
 150              Parent (I + 1) := J;
 151              I := I + 2;
 152            end loop;
 153  
 154            Freq (Freq'Last) := 16#FFFF#;  --  ( Max_Table )
 155            Parent (Root_Position) := 0;
 156          end Start;
 157  
 158          procedure Update_Freq_Tree (C0 : Natural) is
 159  
 160            procedure Reconstruct_Freq_Tree is
 161              I, J, K, F, L : Natural;
 162            begin
 163              --  Halven cumulative freq for leaf nodes
 164              J := 0;
 165              for I in 0 .. Root_Position loop
 166                if Son (I) >= Max_Table then
 167                  Freq (J) := (Freq (I) + 1) / 2;
 168                  Son (J) := Son (I);
 169                  J := J + 1;
 170                end if;
 171              end loop;
 172  
 173              --  Make a tree : first, connect children nodes
 174              I := 0;
 175              for J in N_Char .. Root_Position loop  --  J : free nodes
 176                K := I + 1;
 177                F := Freq (I) + Freq (K); -- new frequency
 178                Freq (J) := F;
 179                K := J - 1;
 180                while F < Freq (K) loop
 181                  K := K - 1;
 182                end loop;
 183  
 184                K := K + 1;
 185                L := J - K;  --  2007: fix: was L:= (J-K)*2, memcopy parameter remain
 186  
 187                Freq (K + 1 .. K + L) := Freq (K .. K + L - 1);  --  shift by one cell right
 188                Freq (K) := F;
 189                Son (K + 1 .. K + L) := Son (K .. K + L - 1);  --  shift by one cell right
 190                Son (K) := I;
 191                I := I + 2;
 192              end loop;
 193  
 194              --  Connect parent nodes
 195              for I in 0 .. Max_Table - 1 loop
 196                K := Son (I);
 197                Parent (K) := I;
 198                if K < Max_Table then
 199                  Parent (K + 1) := I;
 200                end if;
 201              end loop;
 202  
 203            end Reconstruct_Freq_Tree;
 204  
 205            C, I, J, K, L : Natural;
 206  
 207          begin  --  Update_Freq_Tree;
 208            if Freq (Root_Position) = Max_Freq then
 209              Reconstruct_Freq_Tree;
 210            end if;
 211            C := Parent (C0 + Max_Table);
 212            loop
 213              Freq (C) := Freq (C) + 1;
 214              K := Freq (C);
 215              --  Swap nodes to keep the tree freq-ordered
 216              L := C + 1;
 217              if  K > Freq (L) then
 218                while K > Freq (L + 1) loop
 219                  L := L + 1;
 220                end loop;
 221  
 222                Freq (C) := Freq (L);
 223                Freq (L) := K;
 224  
 225                I := Son (C);
 226                Parent (I) := L;
 227                if I < Max_Table then
 228                  Parent (I + 1) := L;
 229                end if;
 230  
 231                J := Son (L);
 232                Son (L) := I;
 233  
 234                Parent (J) := C;
 235                if J < Max_Table then
 236                  Parent (J + 1) := C;
 237                end if;
 238                Son (C) := J;
 239  
 240                C := L;
 241              end if;
 242              C := Parent (C);
 243              exit when C = 0;
 244            end loop;        -- do it until reaching the root
 245          end Update_Freq_Tree;
 246  
 247        end Huffman;
 248  
 249        Node_Nil : constant Integer := String_buffer_size;    --  End of tree's node
 250  
 251        Lson, Dad :  array (0 .. String_buffer_size) of Natural;
 252        Rson :       array (0 .. String_buffer_size + 256) of Natural;
 253  
 254        procedure Init_Tree is
 255        begin
 256          for I in String_buffer_size + 1 .. Rson'Last loop
 257            Rson (I) := Node_Nil;
 258          end loop;  --  root
 259          for I in 0 .. String_buffer_size - 1 loop
 260            Dad (I) := Node_Nil;
 261          end loop;  --  node
 262        end Init_Tree;
 263  
 264        Match_Position : Natural;
 265        Match_Length   : Natural;
 266  
 267        Text_Buf : Text_Buffer := empty_buffer;
 268  
 269        procedure Insert_Node (R : Integer) is
 270          I, P : Integer;
 271          Geq  : Boolean := True;
 272          C    : Natural;
 273        begin
 274          P := String_buffer_size + 1 + Integer (Text_Buf (R));
 275          Rson (R) := Node_Nil;
 276          Lson (R) := Node_Nil;
 277          Match_Length := 0;
 278          loop
 279            if Geq then
 280              if Rson (P) = Node_Nil then
 281                Rson (P) := R;
 282                Dad (R) := P;
 283                return;
 284              end if;
 285              P := Rson (P);
 286            else
 287              if Lson (P) = Node_Nil then
 288                Lson (P) := R;
 289                Dad (R) := P;
 290                return;
 291              end if;
 292              P := Lson (P);
 293            end if;
 294            I := 1;
 295            while I < Look_Ahead and then Text_Buf (R + I) = Text_Buf (P + I)  loop
 296              I := I + 1;
 297            end loop;
 298  
 299            Geq := Text_Buf (R + I) >= Text_Buf (P + I) or I = Look_Ahead;
 300  
 301            if I > Threshold then
 302              if I > Match_Length then
 303                Match_Position := (R - P) mod String_buffer_size - 1;
 304                Match_Length := I;
 305                exit when Match_Length >= Look_Ahead;
 306              end if;
 307              if I = Match_Length then
 308                C := (R - P) mod String_buffer_size - 1;
 309                if C < Match_Position then
 310                  Match_Position := C;
 311                end if;
 312              end if;
 313            end if;
 314          end loop;
 315  
 316          Dad (R)  := Dad (P);
 317          Lson (R) := Lson (P);
 318          Rson (R) := Rson (P);
 319          Dad (Lson (P)) := R;
 320          Dad (Rson (P)) := R;
 321          if Rson (Dad (P)) = P then
 322            Rson (Dad (P)) := R;
 323          else
 324            Lson (Dad (P)) := R;
 325          end if;
 326          Dad (P) := Node_Nil;  --  remove P
 327        end Insert_Node;
 328  
 329        procedure Delete_Node (P : Natural) is
 330          Q : Natural;
 331        begin
 332          if Dad (P) = Node_Nil then  --  unregistered
 333            return;
 334          end if;
 335          if Rson (P) = Node_Nil then
 336            Q := Lson (P);
 337          elsif Lson (P) = Node_Nil then
 338            Q := Rson (P);
 339          else
 340            Q := Lson (P);
 341            if Rson (Q) /= Node_Nil then
 342              loop
 343                Q := Rson (Q);
 344                exit when Rson (Q) = Node_Nil;
 345              end loop;
 346              Rson (Dad (Q)) := Lson (Q);
 347              Dad (Lson (Q)) := Dad (Q);
 348              Lson (Q) := Lson (P);
 349              Dad (Lson (P)) := Q;
 350            end if;
 351            Rson (Q) := Rson (P);
 352            Dad (Rson (P)) := Q;
 353          end if;
 354          Dad (Q) := Dad (P);
 355          if Rson (Dad (P)) = P then
 356            Rson (Dad (P)) := Q;
 357          else
 358            Lson (Dad (P)) := Q;
 359          end if;
 360          Dad (P) := Node_Nil;
 361        end Delete_Node;
 362  
 363        package Huffman_E is new Huffman;
 364  
 365        I, R, S, Last_Match_Length : Natural;
 366        Len : Integer;
 367        C : Byte;
 368      begin
 369        if not More_Bytes then
 370          return;
 371        end if;
 372        Huffman_E.Start;
 373        Init_Tree;
 374        S := 0;
 375        R := String_buffer_size - Look_Ahead;
 376        Len := 0;
 377        while Len < Look_Ahead and More_Bytes loop
 378          Text_Buf (R + Len) := Read_Byte;
 379          Len := Len + 1;
 380        end loop;
 381  
 382        --  Seems: fill dictionary with default value
 383        --
 384        --  for I in 1.. Look_Ahead loop
 385        --    Insert_Node(R - I);
 386        --  end loop;
 387  
 388        Insert_Node (R);
 389  
 390        loop
 391          if Match_Length > Len then
 392            Match_Length := Len;
 393          end if;
 394          if Match_Length <= Threshold then
 395            Match_Length := 1;
 396            Huffman_E.Update_Freq_Tree (Natural (Text_Buf (R)));
 397            Write_Literal (Text_Buf (R));
 398          else
 399            Write_DL_Code (Match_Position + 1, Match_Length);
 400          end if;
 401          Last_Match_Length := Match_Length;
 402          I := 0;
 403          while I < Last_Match_Length and More_Bytes loop
 404            I := I + 1;
 405            Delete_Node (S);
 406            C := Read_Byte;
 407            Text_Buf (S) := C;
 408            if  S < Look_Ahead - 1 then
 409              Text_Buf (S + String_buffer_size) := C;
 410            end if;
 411            S := (S + 1) mod String_buffer_size;
 412            R := (R + 1) mod String_buffer_size;
 413            Insert_Node (R);
 414          end loop;
 415  
 416          while I < Last_Match_Length loop
 417            I := I + 1;
 418            Delete_Node (S);
 419            S := (S + 1) mod String_buffer_size;
 420            R := (R + 1) mod String_buffer_size;
 421            Len := Len - 1;
 422            if Len > 0 then
 423              Insert_Node (R);
 424            end if;
 425          end loop;
 426  
 427          exit when Len = 0;
 428        end loop;
 429      end LZ77_using_LZHuf;
 430  
 431      --------------------------
 432      --  Info-Zip algorithm  --
 433      --------------------------
 434  
 435      --  LZ77_using_IZ: based on deflate.c by Jean-Loup Gailly.
 436      --  Core description of the algorithm:
 437      --
 438      --     The most straightforward technique turns out to be the fastest for
 439      --     most input files: try all possible matches and select the longest.
 440      --     The key feature of this algorithm is that insertions into the string
 441      --     dictionary are very simple and thus fast, and deletions are avoided
 442      --     completely. Insertions are performed at each input character, whereas
 443      --     string matches are performed only when the previous match ends. So it
 444      --     is preferable to spend more time in matches to allow very fast string
 445      --     insertions and avoid deletions. The matching algorithm for small
 446      --     strings is inspired from that of Rabin & Karp [1]. A brute force approach
 447      --     is used to find longer strings when a small match has been found.
 448      --
 449      --     The idea of lazy evaluation of matches is due to Jan-Mark Wams.
 450      --
 451      --     [1] A description of the Rabin and Karp algorithm is given in the book
 452      --         "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
 453      --
 454      --  About hashing: chapter 6.4 of The Art of Computer Programming, Volume 3, D.E. Knuth
 455      --  Rabin and Karp algorithm: http://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
 456  
 457      --  Compression level: 0: store, 1: best speed, 9: best compression, 10: variant of level 9
 458      --  Ada code: only levels 4 to 10 are supported.
 459  
 460      procedure LZ77_using_IZ (level : Natural) is
 461        HASH_BITS : constant := 15;  --  13..15
 462        HASH_SIZE : constant := 2 ** HASH_BITS;
 463        HASH_MASK : constant := HASH_SIZE - 1;
 464        WSIZE     : constant Integer_M32 := Integer_M32 (String_buffer_size);
 465        WMASK     : constant Unsigned_M16 := Unsigned_M16 (WSIZE - 1);
 466        --  HASH_SIZE and WSIZE must be powers of two
 467        NIL      : constant := 0;     --  Tail of hash chains
 468        TOO_FAR  : constant := 4096;  --  Matches of length 3 are discarded if their distance exceeds TOO_FAR
 469        --
 470        subtype ulg is Unsigned_M32;
 471        subtype unsigned is Unsigned_M16;
 472        subtype ush is Unsigned_M16;
 473        --  subtype long is Integer_M32;
 474        --  subtype int is Integer;
 475        subtype Pos is Unsigned_M32;  --  must be at least 32 bits
 476        --  subtype IPos is unsigned;
 477        --  A Pos is an index in the character window. IPos is used only for parameter passing.
 478        window : array (0 .. 2 * WSIZE - 1) of Byte;
 479        --  Sliding window. Input bytes are read into the second half of the window,
 480        --  and move to the first half later to keep a dictionary of at least WSIZE
 481        --  bytes. With this organization, matches are limited to a distance of
 482        --  WSIZE-MAX_MATCH bytes, but this ensures that IO is always
 483        --  performed with a length multiple of the block size.
 484        prev : array (0 .. unsigned (WSIZE - 1)) of Pos;
 485        --  Link to older string with same hash index.
 486        --  This link is maintained only for the last 32K strings.
 487        --  An index in this array is thus a window index modulo 32K.
 488        head : array (0 .. unsigned (HASH_SIZE - 1)) of Pos;
 489        --  Heads of the hash chains or NIL.
 490        window_size : ulg;
 491        --  window size, 2*WSIZE except for MMAP or BIG_MEM, where it is the
 492        --  input file length plus MIN_LOOKAHEAD.
 493        sliding : Boolean;  --  Set to False when the input file is already in memory  [was: int]
 494        ins_h : unsigned;   --  hash index of string to be inserted
 495        MIN_MATCH : constant Integer_M32 := Integer_M32 (Threshold) + 1;    --  Deflate: 3
 496        MAX_MATCH : constant Integer_M32 := Integer_M32 (Look_Ahead);       --  Deflate: 258
 497        --  Minimum amount of lookahead, except at the end of the input file.
 498        MIN_LOOKAHEAD : constant Integer_M32 := MAX_MATCH + MIN_MATCH + 1;  --  Deflate: 262
 499        --  This LZ77 compression doesn't use the full possible distance range: 32507..32768 unused!
 500        MAX_DIST : constant Integer_M32 := WSIZE - MIN_LOOKAHEAD;  --  Deflate: 32506
 501        H_SHIFT : constant Integer := Integer ((HASH_BITS + MIN_MATCH - 1) / MIN_MATCH);
 502        --  Number of bits by which ins_h and del_h must be shifted at each
 503        --  input step. It must be such that after MIN_MATCH steps, the oldest
 504        --  byte no longer takes part in the hash key, that is:
 505        --  H_SHIFT * MIN_MATCH >= HASH_BITS
 506        prev_length : Natural_M32; --  [was: unsigned]
 507        --  Length of the best match at previous step. Matches not greater than this
 508        --  are discarded. This is used in the lazy match evaluation.
 509        strstart    : Natural_M32;   --  start of string to insert [was: unsigned]
 510        match_start : Natural_M32;   --  start of matching string [was: unsigned]
 511        eofile      : Boolean;       --  flag set at end of input file [was: int]
 512        lookahead   : Natural_M32;   --  number of valid bytes ahead in window  [was: unsigned]
 513        max_chain_length : unsigned;
 514        --  To speed up deflation, hash chains are never searched beyond this length.
 515        --  A higher limit improves compression ratio but degrades the speed.
 516        max_lazy_match : Natural_M32;  --  [was: unsigned]
 517        --  Attempt to find a better match only when the current match is strictly
 518        --  smaller than this value. This mechanism is used only for compression
 519        --  levels >= 4.
 520        good_match : Natural_M32;  --  [was: unsigned]
 521        --  Use a faster search when the previous match is longer than this
 522        nice_match : Integer_M32;  --  Stop searching when current match exceeds this
 523        --  Values for max_lazy_match, good_match, nice_match and max_chain_length,
 524        --  depending on the desired pack level (0..9). The values given below have
 525        --  been tuned to exclude worst case performance for pathological files.
 526        --  Better values may be found for specific files.
 527        type config is record
 528          good_length  : Natural_M32;  --  reduce lazy search above this match length [was: ush]
 529          max_lazy     : Natural_M32;  --  do not perform lazy search above this match length
 530          nice_length  : Integer_M32;  --  quit search above this match length
 531          max_chain    : ush;
 532        end record;
 533  
 534        configuration_table : constant array (0 .. 10) of config := (
 535        --  good lazy nice chain
 536            (0,    0,  0,    0),    --  0: store only
 537            (4,    4,  8,    4),    --  1: maximum speed, no lazy matches
 538            (4,    5, 16,    8),
 539            (4,    6, 32,   32),
 540            (4,    4, 16,   16),    --  4: lazy matches
 541            (8,   16, 32,   32),
 542            (8,   16, 128, 128),
 543            (8,   32, 128, 256),
 544            (32, 128, 258, 1024),
 545            (32, 258, 258, 4096),   --  9: maximum compression
 546            (34, 258, 258, 4096));  --  "secret" variant of level 9
 547  
 548        --  Update a hash value with the given input byte
 549        --  IN  assertion: all calls to to UPDATE_HASH are made with consecutive
 550        --     input characters, so that a running hash key can be computed from the
 551        --     previous key instead of complete recalculation each time.
 552  
 553        procedure UPDATE_HASH (h : in out unsigned; c : Byte) is
 554        pragma Inline (UPDATE_HASH);
 555        begin
 556          h := (unsigned (Shift_Left (Unsigned_32 (h), H_SHIFT)) xor unsigned (c)) and HASH_MASK;
 557        end UPDATE_HASH;
 558  
 559        --  Insert string starting at s in the dictionary and set match_head to the previous head
 560        --  of the hash chain (the most recent string with same hash key). Return
 561        --  the previous length of the hash chain.
 562        --  IN  assertion: all calls to to INSERT_STRING are made with consecutive
 563        --     input characters and the first MIN_MATCH bytes of s are valid
 564        --     (except for the last MIN_MATCH-1 bytes of the input file).
 565  
 566        procedure INSERT_STRING (s : Integer_M32; match_head : out Natural_M32) is
 567        pragma Inline (INSERT_STRING);
 568        begin
 569          UPDATE_HASH (ins_h, window (s + MIN_MATCH - 1));
 570          match_head := Natural_M32 (head (ins_h));
 571          prev (unsigned (s) and WMASK) := Pos (match_head);
 572          head (ins_h) := Pos (s);
 573        end INSERT_STRING;
 574  
 575        procedure Read_buf (from : Integer_M32; amount : unsigned; actual : out Integer_M32) is
 576          need : unsigned := amount;
 577        begin
 578          --  put_line("Read buffer: from:" & from'img & ";  amount:" & amount'img);
 579          actual := 0;
 580          while need > 0 and then More_Bytes loop
 581            window (from + actual) := Read_Byte;
 582            actual := actual + 1;
 583            need := need - 1;
 584          end loop;
 585          --  put_line("Read buffer: actual:" & actual'img);
 586        end Read_buf;
 587  
 588        --  Fill the window when the lookahead becomes insufficient.
 589        --  Updates strstart and lookahead, and sets eofile if end of input file.
 590        --
 591        --  IN assertion: lookahead < MIN_LOOKAHEAD && strstart + lookahead > 0
 592        --  OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
 593        --     At least one byte has been read, or eofile is set; file reads are
 594        --     performed for at least two bytes (required for the translate_eol option).
 595  
 596        procedure Fill_window is
 597          more : unsigned;
 598          m : Pos;
 599          n : Natural_M32;
 600        begin
 601          loop
 602            more := unsigned (window_size - ulg (lookahead) - ulg (strstart));
 603            if False then  --  C: "if (more == (unsigned)EOF) {" ?... GdM: seems a 16-bit code for EOF
 604              --  Very unlikely, but possible on 16 bit machine if strstart == 0
 605              --  and lookahead == 1 (input done one byte at time)
 606              more := more - 1;
 607            elsif strstart >= WSIZE + MAX_DIST and then sliding then
 608              --  By the IN assertion, the window is not empty so we can't confuse
 609              --  more == 0 with more == 64K on a 16 bit machine.
 610              window (0 .. WSIZE - 1) := window (WSIZE .. 2 * WSIZE - 1);
 611              --  GdM: in rare cases (e.g. level 9 on test file "enwik8"), match_start happens
 612              --  to be < WSIZE. We do as in the original 16-bit C code: mod 2**16, such that the
 613              --  index is the window's range.
 614              --  This assumes WSIZE = 2**15, which is checked at startup of LZ77_using_IZ.
 615              --  Very likely, match_start is garbage anyway - see http://sf.net/p/infozip/bugs/49/
 616              match_start := Natural_M32 (Unsigned_16 (match_start) - Unsigned_16 (WSIZE mod (2**16)));
 617              strstart    := strstart - WSIZE; -- we now have strstart >= MAX_DIST:
 618              for nn in 0 .. unsigned'(HASH_SIZE - 1) loop
 619                m := head (nn);
 620                if m >= Pos (WSIZE) then
 621                  head (nn) := m - Pos (WSIZE);
 622                else
 623                  head (nn) := NIL;
 624                end if;
 625              end loop;
 626              --
 627              for nn in 0 .. unsigned (WSIZE - 1) loop
 628                m := prev (nn);
 629                if m >= Pos (WSIZE) then
 630                  prev (nn) := m - Pos (WSIZE);
 631                else
 632                  prev (nn) := NIL;
 633                end if;
 634                --  If n is not on any hash chain, prev[n] is garbage but its value will never be used.
 635              end loop;
 636              more := more + unsigned (WSIZE);
 637            end if;
 638            exit when eofile;
 639            --  If there was no sliding:
 640            --     strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
 641            --     more == window_size - lookahead - strstart
 642            --  => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
 643            --  => more >= window_size - 2*WSIZE + 2
 644            --  In the MMAP or BIG_MEM case (not yet supported in gzip),
 645            --    window_size == input_size + MIN_LOOKAHEAD  &&
 646            --    strstart + lookahead <= input_size => more >= MIN_LOOKAHEAD.
 647            --  Otherwise, window_size == 2*WSIZE so more >= 2.
 648            --  If there was sliding, more >= WSIZE. So in all cases, more >= 2.
 649            --
 650            pragma Assert (more >= 2, "more < 2");
 651            --
 652            Read_buf (strstart + lookahead, more, n);
 653            if n = 0 then
 654              eofile := True;
 655            else
 656              lookahead := lookahead + n;
 657            end if;
 658            exit when lookahead >= MIN_LOOKAHEAD or eofile;
 659          end loop;
 660          --  put_line("Fill done - eofile = " & eofile'img);
 661        end Fill_window;
 662  
 663        --  Initialize the "longest match" routines for a new file
 664        --
 665        --  IN assertion: window_size is > 0 if the input file is already read or
 666        --     mapped in the window array, 0 otherwise. In the first case,
 667        --     window_size is sufficient to contain the whole input file plus
 668        --     MIN_LOOKAHEAD bytes (to avoid referencing memory beyond the end
 669        --     of window when looking for matches towards the end).
 670  
 671        procedure LM_Init (pack_level : Natural) is
 672        begin
 673          --  Do not slide the window if the whole input is already in memory (window_size > 0)
 674          sliding := False;
 675          if window_size = 0 then
 676            sliding := True;
 677            window_size := 2 * ulg (WSIZE);
 678          end if;
 679          --  Initialize the hash table.
 680          --  prev will be initialized on the fly.
 681          head := (others => NIL);
 682          --  Set the default configuration parameters:
 683          max_lazy_match   := configuration_table (pack_level).max_lazy;
 684          good_match       := configuration_table (pack_level).good_length;
 685          nice_match       := configuration_table (pack_level).nice_length;
 686          max_chain_length := configuration_table (pack_level).max_chain;
 687          --  Info-Zip comment: ??? reduce max_chain_length for binary files
 688          strstart := 0;
 689          Read_buf (0, unsigned (WSIZE), lookahead);
 690          if lookahead = 0 then
 691            eofile := True;
 692            return;
 693          end if;
 694          eofile := False;
 695          --  Make sure that we always have enough lookahead. This is important
 696          --  if input comes from a device such as a tty.
 697          if lookahead < MIN_LOOKAHEAD then
 698            Fill_window;
 699          end if;
 700          ins_h := 0;
 701          for j in 0 .. Natural_M32 (MIN_MATCH) - 2 loop
 702            UPDATE_HASH (ins_h, window (j));
 703          end loop;
 704          --  If lookahead < MIN_MATCH, ins_h is garbage, but this is
 705          --  not important since only literal bytes will be emitted.
 706        end LM_Init;
 707  
 708        --  Set match_start to the longest match starting at the given string and
 709        --  return its length. Matches shorter or equal to prev_length are discarded,
 710        --  in which case the result is equal to prev_length and match_start is
 711        --  garbage.
 712        --  IN assertions: current_match is the head of the hash chain for the current
 713        --    string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
 714  
 715        procedure Longest_Match (current_match : in out Integer_M32; longest : out Integer_M32) is
 716          chain_length : unsigned := max_chain_length;  --  max hash chain length
 717          scan         : Integer_M32 := strstart;       --  current string
 718          match        : Integer_M32;                   --  matched string
 719          len          : Integer_M32;                   --  length of current match
 720          best_len     : Integer_M32 := prev_length;    --  best match length so far
 721          limit        : Natural_M32;  --  [was: IPos]
 722          strend       : constant Integer_M32 := strstart + MAX_MATCH;
 723          scan_end     : Integer_M32 := scan + best_len;
 724        begin
 725          --  Stop when current_match becomes <= limit. To simplify the code,
 726          --  we prevent matches with the string of window index 0.
 727          if strstart > MAX_DIST then
 728            limit := strstart - MAX_DIST;
 729          else
 730            limit := NIL;
 731          end if;
 732          --  Do not waste too much time if we already have a good match:
 733          if prev_length >= good_match then
 734            chain_length := chain_length / 4;
 735          end if;
 736          pragma Assert
 737            (strstart <= Integer_M32 (window_size) - MIN_LOOKAHEAD,
 738             "insufficient lookahead");  --  In deflate.c
 739          loop
 740            if current_match >= strstart then
 741              --  Added 2020-11-07. The file test/sample.jpg bombs the assertion a few lines later.
 742              longest := MIN_MATCH - 1;
 743              return;
 744            end if;
 745            pragma Assert (current_match < strstart, "no future");  --  In deflate.c
 746            match := current_match;
 747            --  Skip to next match if the match length cannot increase
 748            --  or if the match length is less than 2:
 749            --
 750            --  NB: this is the Not-UNALIGNED_OK variant in the C code.
 751            --      Translation of the UNALIGNED_OK variant is left as an exercise ;-).
 752            --      (!! worth a try: GNAT optimizes window(match..match+1[3]) to 16[32] bit)
 753            --
 754            if window (match + best_len)     /= window (scan_end) or else
 755               window (match + best_len - 1) /= window (scan_end - 1) or else
 756               window (match)                /= window (scan) or else
 757               window (match + 1)            /= window (scan + 1)
 758            then
 759              match := match + 1;  --  C: continue
 760            else
 761              --  The check at best_len - 1 can be removed because it will be made
 762              --  again later. (This heuristic is not always a win.)
 763              --
 764              --  It is not necessary to compare window(scan + 2) and window(match + 2) since they
 765              --  are always equal when the other bytes match, given that
 766              --  the hash keys are equal and that HASH_BITS >= 8.
 767              scan := scan + 2;
 768              match := match + 2;
 769              --  C: The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
 770              --     It is easy to get rid of this optimization if necessary.
 771              --  Ada: see the "else" part below.
 772              if MAX_MATCH = 258 then
 773                --  We check for insufficient lookahead only every 8th comparison;
 774                --  the 256th check will be made at strstart + 258.
 775                loop
 776                  scan := scan + 1;
 777                  match := match + 1;
 778                  exit when window (scan) /= window (match);
 779                  scan := scan + 1;
 780                  match := match + 1;
 781                  exit when window (scan) /= window (match);
 782                  scan := scan + 1;
 783                  match := match + 1;
 784                  exit when window (scan) /= window (match);
 785                  scan := scan + 1;
 786                  match := match + 1;
 787                  exit when window (scan) /= window (match);
 788                  scan := scan + 1;
 789                  match := match + 1;
 790                  exit when window (scan) /= window (match);
 791                  scan := scan + 1;
 792                  match := match + 1;
 793                  exit when window (scan) /= window (match);
 794                  scan := scan + 1;
 795                  match := match + 1;
 796                  exit when window (scan) /= window (match);
 797                  scan := scan + 1;
 798                  match := match + 1;
 799                  exit when window (scan) /= window (match) or else scan >= strend;
 800                end loop;
 801              else
 802                --  We check for insufficient lookahead after every comparison.
 803                loop
 804                  scan := scan + 1;
 805                  match := match + 1;
 806                  exit when window (scan) /= window (match) or else scan >= strend;
 807                end loop;
 808              end if;
 809              --  Assert(scan <= window+(unsigned)(window_size-1), "wild scan");  ??
 810              len := MAX_MATCH - (strend - scan);
 811              scan := strend - MAX_MATCH;
 812              if len > best_len then
 813                match_start := current_match;
 814                best_len := len;
 815                exit when len >= nice_match;
 816                scan_end  := scan + best_len;
 817              end if;
 818            end if;
 819            current_match := Integer_M32 (prev (unsigned (current_match) and WMASK));
 820            exit when current_match <= limit;
 821            chain_length := chain_length - 1;
 822            exit when chain_length = 0;
 823          end loop;
 824          longest := best_len;
 825        end Longest_Match;
 826  
 827        procedure LZ77_part_of_IZ_Deflate is
 828          hash_head  : Natural_M32 := NIL;              --  head of hash chain
 829          prev_match : Natural_M32;                     --  previous match  [was: IPos]
 830          match_available : Boolean := False;           --  set if previous match exists
 831          match_length : Natural_M32 := MIN_MATCH - 1;  --  length of best match
 832          max_insert : Natural_M32;
 833        begin
 834          match_start := 0;  --  NB: no initialization in deflate.c
 835          --  NB: level <= 3 would call deflate_fast;
 836          --
 837          --  Process the input block.
 838          while lookahead /= 0 loop
 839            --  Insert the string window(strstart .. strstart + 2) in the
 840            --  dictionary, and set hash_head to the head of the hash chain:
 841            if lookahead >= MIN_MATCH then
 842              INSERT_STRING (strstart, hash_head);
 843            end if;
 844            --  Find the longest match, discarding those <= prev_length.
 845            prev_length  := match_length;
 846            prev_match   := match_start;
 847            match_length := MIN_MATCH - 1;
 848            if hash_head /= NIL and then
 849               prev_length < max_lazy_match and then
 850               strstart - hash_head <= MAX_DIST
 851            then
 852              --  To simplify the code, we prevent matches with the string
 853              --  of window index 0 (in particular we have to avoid a match
 854              --  of the string with itself at the start of the input file).
 855              --
 856              --  Do not look for matches beyond the end of the input.
 857              --  This is necessary to make deflate deterministic.
 858              if nice_match > lookahead then
 859                nice_match := lookahead;
 860              end if;
 861              Longest_Match (hash_head, match_length);
 862              --  Longest_Match sets match_start
 863              if match_length > lookahead then
 864                match_length := lookahead;
 865              end if;
 866              --  Ignore a length 3 match if it is too distant:
 867              if match_length = MIN_MATCH and then strstart - match_start > TOO_FAR then
 868                --  If prev_match is also MIN_MATCH, match_start is garbage
 869                --  but we will ignore the current match anyway.
 870                match_length := MIN_MATCH - 1;
 871              end if;
 872            end if;
 873            --  If there was a match at the previous step and the current
 874            --  match is not better, output the previous match:
 875            if prev_length >= MIN_MATCH and then match_length <= prev_length then
 876              max_insert := strstart + lookahead - MIN_MATCH;
 877              --  C: in DEBUG mode: check_match(strstart-1, prev_match, prev_length);
 878              --
 879              ------------------------------------
 880              --  Output a Distance-Length code --
 881              ------------------------------------
 882              Write_DL_Code (Positive (strstart - 1 - prev_match), Positive (prev_length));
 883              --  Insert in hash table all strings up to the end of the match.
 884              --  strstart-1 and strstart are already inserted.
 885              lookahead := lookahead - (prev_length - 1);
 886              prev_length := prev_length - 2;
 887              loop
 888                strstart := strstart + 1;
 889                if strstart <= max_insert then
 890                  INSERT_STRING (strstart, hash_head);
 891                  --  strstart never exceeds WSIZE - MAX_MATCH, so there
 892                  --  are always MIN_MATCH bytes ahead.
 893                end if;
 894                prev_length := prev_length - 1;
 895                exit when prev_length = 0;
 896              end loop;
 897              strstart := strstart + 1;
 898              match_available := False;
 899              match_length := MIN_MATCH - 1;
 900            elsif match_available then
 901              --  If there was no match at the previous position, output a
 902              --  single literal. If there was a match but the current match
 903              --  is longer, truncate the previous match to a single literal.
 904              --
 905              ------------------------
 906              --  Output a literal  --
 907              ------------------------
 908              Write_Literal (window (strstart - 1));
 909              strstart := strstart + 1;
 910              lookahead := lookahead - 1;
 911            else
 912              --  There is no previous match to compare with, wait for the next step to decide.
 913              match_available := True;
 914              strstart := strstart + 1;
 915              lookahead := lookahead - 1;
 916            end if;
 917            --  Assert(strstart <= isize && lookahead <= isize, "a bit too far");
 918            --
 919            --  Make sure that we always have enough lookahead, except
 920            --  at the end of the input file. We need MAX_MATCH bytes
 921            --  for the next match, plus MIN_MATCH bytes to insert the
 922            --  string following the next match.
 923            if lookahead < MIN_LOOKAHEAD then
 924              Fill_window;
 925            end if;
 926          end loop;
 927          -----------------------------------
 928          --  Output last literal, if any  --
 929          -----------------------------------
 930          if match_available then
 931            Write_Literal (window (strstart - 1));
 932          end if;
 933        end LZ77_part_of_IZ_Deflate;
 934  
 935        Code_too_clever : exception;
 936      begin
 937        if Look_Ahead /= 258 or String_buffer_size /= 2 ** 15 or Threshold /= 2 then
 938          raise Code_too_clever;  --  was optimized for these parameters
 939        end if;
 940        window_size := 0;
 941        LM_Init (level);
 942        LZ77_part_of_IZ_Deflate;
 943      end LZ77_using_IZ;
 944  
 945      ---------------------------------------------------------------------
 946      --  BT4  -  Binary tree of match positions selected with           --
 947      --          the leading 2 to 4 bytes of each possible match.       --
 948      ---------------------------------------------------------------------
 949  
 950      --  Based on BT4.java and LZMAEncoderFast.java by Lasse Collin,
 951      --  itself based on LzFind.c by Igor Pavlov.
 952  
 953      procedure LZ77_using_BT4 is
 954        MATCH_LEN_MIN : constant Integer := Threshold + 1;
 955        --
 956        readPos     : Integer := -1;
 957        cur_literal : Byte;
 958        readLimit   : Integer := -1;
 959        finishing   : constant Boolean := False;
 960        writePos    : Integer :=  0;
 961        pendingSize : Integer :=  0;
 962        --
 963        OPTS : constant := 4096;
 964        EXTRA_SIZE_BEFORE : constant :=  OPTS;
 965        EXTRA_SIZE_AFTER  : constant :=  OPTS;
 966  
 967        keepSizeBefore : constant Integer := EXTRA_SIZE_BEFORE + String_buffer_size;
 968        keepSizeAfter  : constant Integer := EXTRA_SIZE_AFTER  + Look_Ahead;
 969        reserveSize : constant Integer :=
 970          Integer'Min (
 971            String_buffer_size / 2 +
 972            256 * (2 ** 10),  --  256 KiB
 973            512 * (2 ** 20)   --  512 MiB
 974          );
 975        getBufSize : constant Integer := keepSizeBefore + keepSizeAfter + reserveSize;
 976  
 977        type Int_array is array (Natural range <>) of Integer;
 978        type p_Int_array is access Int_array;
 979        procedure Dispose is new Ada.Unchecked_Deallocation (Int_array, p_Int_array);
 980  
 981        procedure Normalize (positions : in out Int_array; normalizationOffset : Integer) is
 982        begin
 983          for i in 0 .. positions'Length - 1 loop
 984            if positions (i) <= normalizationOffset then
 985              positions (i) := 0;
 986            else
 987              positions (i) := positions (i) - normalizationOffset;
 988            end if;
 989          end loop;
 990        end Normalize;
 991  
 992        function Get_Available return Integer is
 993        pragma Inline (Get_Available);
 994        begin
 995           --  Compared to the Java version: - 1 shift for getting readPos
 996           --  in buf'Range upon: cur_literal := buf (readPos);
 997          return writePos - readPos - 1;
 998        end Get_Available;
 999  
1000        function Move_Pos (requiredForFlushing, requiredForFinishing : Integer) return Integer is
1001          --  Java name: movePos.
1002          avail : Integer;
1003        begin
1004          pragma Assert (requiredForFlushing >= requiredForFinishing);
1005          readPos := readPos + 1;
1006          avail   := Get_Available;
1007          if avail < requiredForFlushing then
1008            if avail < requiredForFinishing or else not finishing
1009            then
1010              pendingSize := pendingSize + 1;
1011              --  GdM: This causes cyclicPos and lzpos not being in sync with readPos.
1012              --       The pendingSize value is there for catching up.
1013              avail := 0;
1014            end if;
1015          end if;
1016          return avail;
1017        end Move_Pos;
1018  
1019        function getHash4Size return Integer is
1020          h : Unsigned_32 := Unsigned_32 (String_buffer_size - 1);
1021        begin
1022          h := h or Shift_Right (h, 1);
1023          h := h or Shift_Right (h, 2);
1024          h := h or Shift_Right (h, 4);
1025          h := h or Shift_Right (h, 8);
1026          h := Shift_Right (h, 1);
1027          h := h or 16#FFFF#;  --  LzFind.c: "don't change it! It's required for Deflate"
1028          if h > 2 ** 24 then
1029            h := Shift_Right (h, 1);
1030          end if;
1031          return Integer (h + 1);
1032        end getHash4Size;
1033  
1034        type p_Byte_Array is access Byte_Array;
1035        procedure Dispose is new Ada.Unchecked_Deallocation (Byte_Array, p_Byte_Array);
1036  
1037        package Hash234 is
1038          HASH_2_SIZE : constant := 2 ** 10;
1039          HASH_2_MASK : constant := HASH_2_SIZE - 1;
1040          HASH_3_SIZE : constant := 2 ** 16;
1041          HASH_3_MASK : constant := HASH_3_SIZE - 1;
1042          hash_4_size : constant Integer := getHash4Size;
1043          hash_4_mask : constant Unsigned_32 := Unsigned_32 (hash_4_size) - 1;
1044          --
1045          hash2Table : Int_array (0 .. HASH_2_SIZE - 1) := (others => 0);  --  [Initialization added]
1046          hash3Table : Int_array (0 .. HASH_3_SIZE - 1) := (others => 0);  --  [Initialization added]
1047          hash4Table : p_Int_array;
1048          --
1049          hash2Value, hash3Value, hash4Value : Unsigned_32 := 0;
1050          --
1051          procedure calcHashes (buf : Byte_Array; off : Integer);
1052          procedure updateTables (pos : Integer);
1053          procedure Normalize (normalizeOffset : Integer);
1054        end Hash234;
1055  
1056        package body Hash234 is
1057  
1058          crcTable : array (Byte) of Unsigned_32;
1059          CRC32_POLY : constant := 16#EDB8_8320#;
1060  
1061          procedure calcHashes (buf : Byte_Array; off : Integer) is
1062            temp : Unsigned_32 := crcTable (buf (off)) xor Unsigned_32 (buf (off + 1));
1063          begin
1064            hash2Value := temp and HASH_2_MASK;
1065            temp := temp xor Shift_Left (Unsigned_32 (buf (off + 2)), 8);
1066            hash3Value := temp and HASH_3_MASK;
1067            temp := temp xor Shift_Left (crcTable (buf (off + 3)), 5);
1068            hash4Value := temp and hash_4_mask;
1069          end calcHashes;
1070  
1071          procedure updateTables (pos : Integer) is
1072          begin
1073            hash2Table (Integer (hash2Value)) := pos;
1074            hash3Table (Integer (hash3Value)) := pos;
1075            hash4Table (Integer (hash4Value)) := pos;
1076          end updateTables;
1077  
1078          procedure Normalize (normalizeOffset : Integer) is
1079          begin
1080            Normalize (hash2Table, normalizeOffset);
1081            Normalize (hash3Table, normalizeOffset);
1082            Normalize (hash4Table.all, normalizeOffset);
1083          end Normalize;
1084  
1085          r : Unsigned_32;
1086        begin
1087          --  NB: heap allocation used only for convenience because of
1088          --      small default stack sizes on some compilers.
1089          hash4Table := new Int_array (0 .. hash_4_size - 1);
1090          hash4Table.all := (others => 0);  --  [Initialization added]
1091          for i in Byte loop
1092            r := Unsigned_32 (i);
1093            for j in 0 .. 7 loop
1094              if (r and 1) = 0 then
1095                r := Shift_Right (r, 1);
1096              else
1097                r := Shift_Right (r, 1) xor CRC32_POLY;
1098              end if;
1099            end loop;
1100            crcTable (i) := r;
1101          end loop;
1102        end Hash234;
1103  
1104        Nice_Length : constant Integer := Integer'Min (162, Look_Ahead);  --  const. was 64
1105        Depth_Limit : constant := 48;  --  Alternatively: 16 + Nice_Length / 2
1106  
1107        cyclicSize : constant Integer := String_buffer_size;  --  Had: + 1;
1108        cyclicPos  : Integer := -1;
1109        lzPos      : Integer := cyclicSize;
1110  
1111        max_dist : constant Integer := cyclicSize - (Look_Ahead + 2);
1112        --  NB: 2020-11-04: added "- (Look_Ahead + 2)" to prevent corruption of
1113        --  the expansion buffer in LZMA.Encoding when DL codes are tested in front
1114        --  of the actual writes, before actual entropy compression (since rev. #850).
1115  
1116        package BT4_Algo is
1117          procedure Skip (len : Natural);
1118          pragma Inline (Skip);
1119          procedure Read_One_and_Get_Matches (matches : out Matches_Type);
1120        end BT4_Algo;
1121  
1122        buf : p_Byte_Array;
1123        tree : p_Int_array;
1124  
1125        package body BT4_Algo is
1126  
1127          function Move_Pos_in_BT4 return Integer is
1128            --  Java name: movePos.
1129            avail : constant Integer :=
1130              Move_Pos (requiredForFlushing  => Nice_Length,
1131                        requiredForFinishing => 4);
1132            normalizationOffset : Integer;
1133          begin
1134            --  Put_Line ("BT4_Algo.Move_Pos_in_BT4");
1135            if avail /= 0 then
1136              lzPos := lzPos + 1;
1137              if lzPos = Integer'Last then
1138                normalizationOffset := Integer'Last - cyclicSize;
1139                Hash234.Normalize (normalizationOffset);
1140                Normalize (tree.all, normalizationOffset);
1141                lzPos := lzPos - normalizationOffset;
1142              end if;
1143              cyclicPos := cyclicPos + 1;
1144              if cyclicPos = cyclicSize then
1145                --  Put_Line("cyclicPos zeroed");
1146                cyclicPos := 0;
1147              end if;
1148            end if;
1149            return avail;
1150          end Move_Pos_in_BT4;
1151  
1152          Null_position : constant := -1;  --  LzFind.c: kEmptyHashValue, 0
1153  
1154          procedure Skip_and_Update_Tree (niceLenLimit : Integer; currentMatch : in out Integer) is
1155            delta0, depth, ptr0, ptr1, pair, len, len0, len1 : Integer;
1156          begin
1157            --  Put("BT4_Algo.Skip_and_Update_Tree... ");
1158            depth := Depth_Limit;
1159            ptr0  := cyclicPos * 2 + 1;
1160            ptr1  := cyclicPos * 2;
1161            len0  := 0;
1162            len1  := 0;
1163            loop
1164              delta0 := lzPos - currentMatch;
1165              if depth = 0 or else delta0 >= max_dist then
1166                tree (ptr0) := Null_position;
1167                tree (ptr1) := Null_position;
1168                return;
1169              end if;
1170              depth := depth - 1;
1171              if cyclicPos - delta0 < 0 then
1172                pair := cyclicSize;
1173              else
1174                pair := 0;
1175              end if;
1176              pair := (cyclicPos - delta0 + pair) * 2;
1177              len  := Integer'Min (len0, len1);
1178              --  Match ?
1179              if buf (readPos + len - delta0) = buf (readPos + len) then
1180                --  No need to look for longer matches than niceLenLimit
1181                --  because we only are updating the tree, not returning
1182                --  matches found to the caller.
1183                loop
1184                  len := len + 1;
1185                  if len = niceLenLimit then
1186                    tree (ptr1) := tree (pair);
1187                    tree (ptr0) := tree (pair + 1);
1188                    return;
1189                  end if;
1190                  exit when buf (readPos + len - delta0) /= buf (readPos + len);
1191                end loop;
1192              end if;
1193              --  Bytes are no more matching. The past value is either smaller...
1194              if buf (readPos + len - delta0) < buf (readPos + len) then
1195                tree (ptr1) := currentMatch;
1196                ptr1 := pair + 1;
1197                currentMatch := tree (ptr1);
1198                len1 := len;
1199              else  --  ... or larger
1200                tree (ptr0) := currentMatch;
1201                ptr0 := pair;
1202                currentMatch := tree (ptr0);
1203                len0 := len;
1204              end if;
1205            end loop;
1206          end Skip_and_Update_Tree;
1207  
1208          procedure Skip (len : Natural) is
1209            --
1210            procedure Skip_one is
1211            pragma Inline (Skip_one);
1212              niceLenLimit, avail, currentMatch : Integer;
1213            begin
1214              niceLenLimit := Nice_Length;
1215              avail := Move_Pos_in_BT4;
1216              if avail < niceLenLimit then
1217                if avail = 0 then
1218                  return;
1219                end if;
1220                niceLenLimit := avail;
1221              end if;
1222              Hash234.calcHashes (buf.all, readPos);
1223              currentMatch := Hash234.hash4Table (Integer (Hash234.hash4Value));
1224              Hash234.updateTables (lzPos);
1225              Skip_and_Update_Tree (niceLenLimit, currentMatch);
1226            end Skip_one;
1227            --
1228          begin
1229            for count in reverse 1 .. len loop
1230              Skip_one;
1231            end loop;
1232          end Skip;
1233  
1234          procedure Read_One_and_Get_Matches (matches : out Matches_Type) is
1235            matchLenLimit : Integer := Look_Ahead;
1236            niceLenLimit  : Integer := Nice_Length;
1237            avail : Integer;
1238            delta0, delta2, delta3, currentMatch,
1239            lenBest, depth, ptr0, ptr1, pair, len, len0, len1 : Integer;
1240          begin
1241            --  Put("BT4_Algo.Get_Matches... ");
1242            matches.count := 0;
1243            avail := Move_Pos_in_BT4;
1244            if avail < matchLenLimit then
1245              if avail = 0 then
1246                return;
1247              end if;
1248              matchLenLimit := avail;
1249              if niceLenLimit > avail then
1250                niceLenLimit := avail;
1251              end if;
1252            end if;
1253            --
1254            Hash234.calcHashes (buf.all, readPos);
1255            delta2 := lzPos - Hash234.hash2Table (Integer (Hash234.hash2Value));
1256            delta3 := lzPos - Hash234.hash3Table (Integer (Hash234.hash3Value));
1257            currentMatch :=   Hash234.hash4Table (Integer (Hash234.hash4Value));
1258            Hash234.updateTables (lzPos);
1259            --
1260            lenBest := 0;
1261            --  See if the hash from the first two bytes found a match.
1262            --  The hashing algorithm guarantees that if the first byte
1263            --  matches, also the second byte does, so there's no need to
1264            --  test the second byte.
1265            if delta2 < max_dist and then buf (readPos - delta2) = buf (readPos) then
1266              --  Match of length 2 found and checked.
1267              lenBest := 2;
1268              matches.count := 1;
1269              matches.dl (matches.count).length := 2;
1270              matches.dl (matches.count).distance := delta2;
1271            end if;
1272            --  See if the hash from the first three bytes found a match that
1273            --  is different from the match possibly found by the two-byte hash.
1274            --  Also here the hashing algorithm guarantees that if the first byte
1275            --  matches, also the next two bytes do.
1276            if delta2 /= delta3 and then delta3 < max_dist
1277                    and then buf (readPos - delta3) = buf (readPos)
1278            then
1279              --  Match of length 3 found and checked.
1280              lenBest := 3;
1281              matches.count := matches.count + 1;
1282              matches.dl (matches.count).distance := delta3;
1283              delta2 := delta3;
1284            end if;
1285            --  If a match was found, see how long it is.
1286            if matches.count > 0 then
1287              while lenBest < matchLenLimit and then buf (readPos + lenBest - delta2)
1288                                                   = buf (readPos + lenBest)
1289              loop
1290                lenBest := lenBest + 1;
1291              end loop;
1292              matches.dl (matches.count).length := lenBest;
1293              --  Return if it is long enough (niceLen or reached the end of the dictionary).
1294              if lenBest >= niceLenLimit then
1295                Skip_and_Update_Tree (niceLenLimit, currentMatch);
1296                return;
1297              end if;
1298            end if;
1299            --  A long enough match wasn't found so easily.
1300            --  Look for better matches from the binary tree.
1301            if lenBest < 3 then
1302              lenBest := 3;
1303            end if;
1304            depth := Depth_Limit;
1305            ptr0  := cyclicPos * 2 + 1;
1306            ptr1  := cyclicPos * 2;
1307            len0  := 0;
1308            len1  := 0;
1309            --
1310            loop
1311              delta0 := lzPos - currentMatch;
1312              --  Return if the search depth limit has been reached or
1313              --  if the distance of the potential match exceeds the
1314              --  dictionary size.
1315              if depth = 0 or else delta0 >= max_dist then
1316                tree (ptr0) := Null_position;
1317                tree (ptr1) := Null_position;
1318                return;
1319              end if;
1320              depth := depth - 1;
1321              --
1322              if cyclicPos - delta0 < 0 then
1323                pair := cyclicSize;
1324              else
1325                pair := 0;
1326              end if;
1327              pair := (cyclicPos - delta0 + pair) * 2;
1328              len  := Integer'Min (len0, len1);
1329              --  Match ?
1330              if buf (readPos + len - delta0) = buf (readPos + len) then
1331                loop
1332                  len := len + 1;
1333                  exit when len >= matchLenLimit
1334                    or else buf (readPos + len - delta0) /= buf (readPos + len);
1335                end loop;
1336                if len > lenBest then
1337                  lenBest := len;
1338                  matches.count := matches.count + 1;
1339                  matches.dl (matches.count).length := len;
1340                  matches.dl (matches.count).distance := delta0;
1341                  if len >= niceLenLimit then
1342                    tree (ptr1) := tree (pair);
1343                    tree (ptr0) := tree (pair + 1);
1344                    return;
1345                  end if;
1346                end if;
1347              end if;
1348              --  Bytes are no more matching. The past value is either smaller...
1349              if buf (readPos + len - delta0) < buf (readPos + len) then
1350                tree (ptr1) := currentMatch;
1351                ptr1 := pair + 1;
1352                currentMatch := tree (ptr1);
1353                len1 := len;
1354              else  --  ... or larger
1355                tree (ptr0) := currentMatch;
1356                ptr0 := pair;
1357                currentMatch := tree (ptr0);
1358                len0 := len;
1359              end if;
1360            end loop;
1361          end Read_One_and_Get_Matches;
1362  
1363        begin
1364          --  NB: heap allocation used only for convenience because of
1365          --      small default stack sizes on some compilers.
1366          tree := new Int_array (0 .. cyclicSize * 2 - 1);
1367          for i in tree'Range loop
1368            tree (i) := Null_position;
1369          end loop;
1370        end BT4_Algo;
1371  
1372        --  Moves data from the end of the buffer to the beginning, discarding
1373        --  old data and making space for new input.
1374  
1375        procedure Move_Window is
1376          --  Java name: moveWindow.
1377          --  Align the move to a multiple of 16 bytes (LZMA-friendly, see pos_bits)
1378          moveOffset : constant Integer := ((readPos + 1 - keepSizeBefore) / 16) * 16;
1379          moveSize   : constant Integer := writePos - moveOffset;
1380        begin
1381          --  Put_Line("  Move window, size=" & moveSize'Img & " offset=" & moveOffset'Img);
1382          buf (0 .. moveSize - 1) := buf (moveOffset .. moveOffset + moveSize - 1);
1383          readPos   := readPos   - moveOffset;
1384          readLimit := readLimit - moveOffset;
1385          writePos  := writePos  - moveOffset;
1386        end Move_Window;
1387  
1388       --  Copies new data into the buffer.
1389       function Fill_Window (len_initial : Integer) return Integer is
1390       --  Java name: fillWindow
1391  
1392         --  Process pending data that hasn't been ran through the match finder yet.
1393         --  Run it through the match finder now if there is enough new data
1394         --  available (readPos < readLimit) that the encoder may encode at
1395         --  least one more input byte.
1396         --
1397         procedure processPendingBytes is
1398           oldPendingSize : Integer;
1399         begin
1400           if pendingSize > 0 and then readPos < readLimit then
1401             readPos := readPos - pendingSize;
1402             oldPendingSize := pendingSize;
1403             pendingSize := 0;
1404             BT4_Algo.Skip (oldPendingSize);
1405           end if;
1406         end processPendingBytes;
1407         --
1408         len : Integer := len_initial;
1409         actual_len : Integer := 0;
1410       begin
1411          --  Put_Line("Fill window - start");
1412          --  Move the sliding window if needed.
1413          if readPos >= buf'Length - keepSizeAfter then
1414            Move_Window;
1415          end if;
1416  
1417          --  Try to fill the dictionary buffer up to its boundary.
1418          if len > buf'Length - writePos then
1419            len := buf'Length - writePos;
1420          end if;
1421  
1422          while len > 0 and then More_Bytes loop
1423            buf (writePos) := Read_Byte;
1424            writePos := writePos + 1;
1425            len := len - 1;
1426            actual_len := actual_len + 1;
1427          end loop;
1428  
1429          --  Set the new readLimit but only if there's enough data to allow
1430          --  encoding of at least one more byte.
1431          if writePos >= keepSizeAfter then
1432            readLimit := writePos - keepSizeAfter;
1433          end if;
1434  
1435          processPendingBytes;
1436  
1437          --  Put_Line("Fill window, requested=" & len_initial'Img & " actual=" & actual_len'Img);
1438          --  Tell the caller how much input we actually copied into the dictionary.
1439          return actual_len;
1440        end Fill_Window;
1441  
1442        function Compute_Match_Length (distance, length_limit : Integer) return Natural is
1443        pragma Inline (Compute_Match_Length);
1444          back_pos : constant Integer := readPos - distance;
1445          len : Integer := 0;
1446        begin
1447          if distance < 2 then
1448            return 0;
1449          end if;
1450          --  @ if readPos+len not in buf.all'Range then
1451          --  @   Put("**** readpos " & buf'Last'Img & readPos'Img);
1452          --  @ end if;
1453          --  @ if backPos+len not in buf.all'Range then
1454          --  @   Put("**** backpos " & buf'Last'Img & back_pos'Img);
1455          --  @ end if;
1456          while len < length_limit and then buf (readPos + len) = buf (back_pos + len) loop
1457            len := len + 1;
1458          end loop;
1459          return len;
1460        end Compute_Match_Length;
1461  
1462        readAhead : Integer := -1;  --  LZMAEncoder.java
1463        --  Small stack of recent distances used for LZMA.
1464        subtype Repeat_stack_range is Integer range 0 .. 3;
1465        --  1-based distances.
1466        rep_dist : array (Repeat_stack_range) of Distance_Type := (others => 1);
1467        len_rep_dist : array (Repeat_stack_range) of Natural := (others => 0);
1468  
1469        function Has_much_smaller_Distance (smallDist, bigDist : Distance_Type) return Boolean is
1470        pragma Inline (Has_much_smaller_Distance);
1471        begin
1472          return (smallDist - 1) < (bigDist - 1) / 128;
1473        end Has_much_smaller_Distance;
1474  
1475        best_length_for_rep_dist, best_rep_dist_index : Integer;
1476  
1477        procedure Read_One_and_Get_Matches (matches : out Matches_Type) is
1478          avail, len : Integer;
1479        begin
1480          readAhead := readAhead + 1;
1481          --
1482          BT4_Algo.Read_One_and_Get_Matches (matches);
1483          --
1484          if LZMA_friendly then
1485            best_length_for_rep_dist := 0;
1486            avail := Integer'Min (Get_Available, Look_Ahead);
1487            if avail >= MATCH_LEN_MIN then
1488              for rep in Repeat_stack_range loop
1489                len := Compute_Match_Length (rep_dist (rep), avail);
1490                len_rep_dist (rep) := len;
1491                --  Remember the index and length of the best repeated match.
1492                if len > best_length_for_rep_dist then
1493                  best_rep_dist_index      := rep;
1494                  best_length_for_rep_dist := len;
1495                end if;
1496              end loop;
1497            else
1498              for rep in Repeat_stack_range loop
1499                len_rep_dist (rep) := 0;  --  No match possible in any case.
1500              end loop;
1501            end if;
1502          end if;
1503        end Read_One_and_Get_Matches;
1504  
1505        procedure Get_supplemental_Matches_from_Repeat_Matches (matches : in out Matches_Type) is
1506          len, ins : Integer;
1507        begin
1508          if matches.count = 0 then
1509            if best_length_for_rep_dist >= MATCH_LEN_MIN then
1510              matches.dl (1).distance := rep_dist (best_rep_dist_index);
1511              matches.dl (1).length   := best_length_for_rep_dist;
1512              matches.count := 1;
1513            end if;
1514          end if;
1515          for rep in Repeat_stack_range loop
1516            len := len_rep_dist (rep);
1517            if len >= MATCH_LEN_MIN then
1518              ins := 0;
1519              for i in reverse 1 .. matches.count loop
1520                if len = matches.dl (i).length then
1521                  if rep_dist (rep) = matches.dl (i).distance then
1522                    null;  --  Identical match
1523                  else
1524                    --  Tie: insert the repeat match of same length into the list.
1525                    --  If the longest match strategy is applied, the second item is preferred.
1526                    if Has_much_smaller_Distance (matches.dl (i).distance, rep_dist (rep)) then
1527                      ins := i;      --  Insert before
1528                    else
1529                      ins := i + 1;  --  Insert after
1530                    end if;
1531                    exit;
1532                    --  Ada.Text_IO.Put_Line ("Tie");
1533                  end if;
1534                elsif i < matches.count then
1535                  if len > matches.dl (i).length and then len < matches.dl (i + 1).length then
1536                    --  Insert between existing lengths
1537                    ins := i + 1;
1538                    exit;
1539                  --  We don't add len as the shortest length (worsens compression).
1540                  ------
1541                  --  elsif i = 1
1542                  --    and then len >= MATCH_LEN_MIN
1543                  --    and then len >= matches.dl (1).length - 1  --  Some reluctance...
1544                  --  then
1545                  --    ins := 1;
1546                  end if;
1547                elsif len > matches.dl (i).length then
1548                  --  i = matches.count in this case: add as longest.
1549                  ins := i + 1;
1550                  exit;
1551                end if;
1552              end loop;
1553              --  We can insert this repeat match at position 'ins'.
1554              if ins > 0 then
1555                for i in reverse ins .. matches.count loop  --  Empty if ins > count.
1556                  matches.dl (i + 1) := matches.dl (i);
1557                end loop;
1558                matches.dl (ins).distance := rep_dist (rep);
1559                matches.dl (ins).length   := len;
1560                matches.count := matches.count + 1;
1561                exit;
1562              end if;
1563            end if;
1564          end loop;
1565          pragma Assert (Are_Matches_Sorted (matches));
1566        end Get_supplemental_Matches_from_Repeat_Matches;
1567  
1568        procedure Skip (len : Natural) is
1569        pragma Inline (Skip);
1570        begin
1571          readAhead := readAhead + len;
1572          BT4_Algo.Skip (len);
1573        end Skip;
1574  
1575        procedure Reduce_consecutive_max_lengths (m : in out Matches_Type) is
1576        --  Sometimes the BT4 algo returns a long list with consecutive lengths.
1577        --  We try to reduce it, if there is a clear advantage with distances.
1578        begin
1579          while m.count > 1
1580            and then m.dl (m.count).length = m.dl (m.count - 1).length + 1
1581            and then Has_much_smaller_Distance (m.dl (m.count - 1).distance, m.dl (m.count).distance)
1582          loop
1583            m.count := m.count - 1;
1584          end loop;
1585        end Reduce_consecutive_max_lengths;
1586  
1587        procedure Show_Matches (m : Matches_Type; phase : String) is
1588        begin
1589          Ada.Text_IO.Put_Line (
1590            phase & " --- Matches: " & Integer'Image (m.count)
1591          );
1592          for i in 1 .. m.count loop
1593            Ada.Text_IO.Put_Line (
1594              "  Distance:" & Integer'Image (m.dl (i).distance) &
1595              ";  Length:" & Integer'Image (m.dl (i).length)
1596            );
1597          end loop;
1598        end Show_Matches;
1599        pragma Unreferenced (Show_Matches);
1600  
1601        matches : Matches_Array (0 .. 1);
1602        current_match_index : Prefetch_Index_Type := 0;
1603        match_trace : DLP_Array (1 .. Max_Length_any_Algo);
1604  
1605        procedure Get_Next_Symbol is
1606          new_ld, main : Distance_Length_Pair;
1607  
1608          --  This function is for debugging. The matches stored in the 'tree' array
1609          --  may be wrong if the variables cyclicPos, lzPos and readPos are not in sync.
1610          --  The issue seems to have been solved now (rev. 489).
1611          function Is_match_correct (shift : Natural) return Boolean is
1612          begin
1613            for i in reverse -1 + shift .. main.length - 2 + shift loop
1614              if buf (readPos - (main.distance) + i) /= buf (readPos + i) then
1615                return False;  --  Should not occur.
1616              end if;
1617            end loop;
1618            return True;
1619          end Is_match_correct;
1620  
1621          procedure Send_first_literal_of_match is
1622          begin
1623            Write_Literal (cur_literal);
1624            readAhead := readAhead - 1;
1625          end Send_first_literal_of_match;
1626  
1627          procedure Send_DL_code (distance, length : Integer) is
1628            found_repeat : Integer := rep_dist'First - 1;
1629            aux : Integer;
1630          begin
1631            Write_DL_Code (distance, length);
1632            readAhead := readAhead - length;
1633            if LZMA_friendly then
1634              --
1635              --  Manage the stack of recent distances in the same way the "MA" part of LZMA does.
1636              --
1637              for i in rep_dist'Range loop
1638                if distance = rep_dist (i) then
1639                  found_repeat := i;
1640                  exit;
1641                end if;
1642              end loop;
1643              if found_repeat >= rep_dist'First then
1644                --  Roll the stack of recent distances up to the item with index found_repeat,
1645                --  which becomes first. If found_repeat = rep_dist'First, no actual change occurs.
1646                aux := rep_dist (found_repeat);
1647                for i in reverse rep_dist'First + 1 .. found_repeat loop
1648                  rep_dist (i) := rep_dist (i - 1);
1649                end loop;
1650                rep_dist (rep_dist'First) := aux;
1651              else
1652                --  Shift the stack of recent distances; the new distance becomes the first item.
1653                for i in reverse rep_dist'First + 1 .. rep_dist'Last loop
1654                  rep_dist (i) := rep_dist (i - 1);
1655                end loop;
1656                rep_dist (0) := distance;
1657              end if;
1658            end if;
1659          end Send_DL_code;
1660  
1661          avail, limit : Integer;
1662          index_max_score : Positive;
1663          set_max_score : Prefetch_Index_Type;
1664          hurdle : constant := 40;
1665        begin
1666          --  Get the matches for the next byte unless readAhead indicates
1667          --  that we already got the new matches during the previous call
1668          --  to this procedure.
1669          if readAhead = -1 then
1670            Read_One_and_Get_Matches (matches (current_match_index));
1671          end if;
1672          --  @ if readPos not in buf.all'Range then
1673          --  @   Put("**** " & buf'Last'Img & keepSizeAfter'Img & readPos'Img & writePos'Img);
1674          --  @ end if;
1675          cur_literal := buf (readPos);
1676          --  Get the number of bytes available in the dictionary, but
1677          --  not more than the maximum match length. If there aren't
1678          --  enough bytes remaining to encode a match at all, return
1679          --  immediately to encode this byte as a literal.
1680          avail := Integer'Min (Get_Available, Look_Ahead);
1681          if avail < MATCH_LEN_MIN then
1682            --  Put("[a]");
1683            Send_first_literal_of_match;
1684            return;
1685          end if;
1686  
1687          if LZMA_friendly and then best_length_for_rep_dist >= Nice_Length then
1688            Skip (best_length_for_rep_dist - 1);
1689            --  Put_Line("[DL RA]");
1690            Send_DL_code (rep_dist (best_rep_dist_index), best_length_for_rep_dist);
1691            return;
1692          end if;
1693  
1694          main := (length => 1, distance => 1);
1695          if matches (current_match_index).count > 0 then
1696            main := matches (current_match_index).dl (matches (current_match_index).count);
1697            if main.length >= Nice_Length then
1698              pragma Assert (Is_match_correct (1));
1699              Skip (main.length - 1);
1700              --  Put_Line("[DL A]" & mainDist'Img & mainLen'Img);
1701              Send_DL_code (main.distance, main.length);
1702              return;
1703            end if;
1704            Reduce_consecutive_max_lengths (matches (current_match_index));
1705            if LZMA_friendly then
1706              Get_supplemental_Matches_from_Repeat_Matches (matches (current_match_index));
1707            end if;
1708            main := matches (current_match_index).dl (matches (current_match_index).count);
1709            --
1710            if main.length = MATCH_LEN_MIN and then main.distance > 128 then
1711              main.length := 1;
1712            end if;
1713          end if;
1714  
1715          if LZMA_friendly
1716            and then best_length_for_rep_dist > MATCH_LEN_MIN
1717            and then (best_length_for_rep_dist >= main.length
1718              or else (best_length_for_rep_dist >= main.length - 2 and then main.distance > 2 ** 9)
1719              or else (best_length_for_rep_dist >= main.length - 3 and then main.distance > 2 ** 15))
1720          then
1721            --  Shortcut: we choose the longest repeat match.
1722            Skip (best_length_for_rep_dist - 1);
1723            --  Put_Line("[DL RB]");
1724            Send_DL_code (rep_dist (best_rep_dist_index), best_length_for_rep_dist);
1725            return;
1726          end if;
1727  
1728          if main.length < MATCH_LEN_MIN or else avail <= MATCH_LEN_MIN then
1729            --  Put("[b]");
1730            Send_first_literal_of_match;
1731            return;
1732          end if;
1733  
1734          -------------------------------------------------------------------------
1735          --  Get the next match. Test if it is better than the current match.   --
1736          --  If so, encode the current byte as a literal.                       --
1737          -------------------------------------------------------------------------
1738          current_match_index := 1 - current_match_index;
1739          Read_One_and_Get_Matches (matches (current_match_index));
1740          --
1741          --  Show_Matches (matches (1 - current_match_index), "------ Old");
1742          --  Show_Matches (matches (current_match_index),     "       New");
1743          --
1744          if matches (current_match_index).count > 0 then
1745            new_ld := matches (current_match_index).dl (matches (current_match_index).count);  --  Longest new match
1746            if        (new_ld.length >= main.length + hurdle     and then new_ld.distance < main.distance)
1747              or else
1748                (new_ld.length =  main.length + hurdle + 1
1749                 and then not Has_much_smaller_Distance (main.distance, new_ld.distance))
1750              or else  new_ld.length >  main.length + hurdle + 1
1751              or else (new_ld.length >= main.length + hurdle - 1
1752                  and then main.length >= MATCH_LEN_MIN + 1
1753                  and then Has_much_smaller_Distance (new_ld.distance, main.distance))
1754            then
1755              --  We prefer literal, then the new match (or even better!)
1756              Send_first_literal_of_match;
1757              return;
1758            end if;
1759            --
1760            --  Here we compare the scores of both match sets.
1761            --
1762            Reduce_consecutive_max_lengths (matches (current_match_index));
1763            if LZMA_friendly then
1764              Get_supplemental_Matches_from_Repeat_Matches (matches (current_match_index));
1765            end if;
1766            Estimate_DL_Codes (
1767              matches, 1 - current_match_index, (1 => cur_literal),
1768              index_max_score, set_max_score, match_trace
1769            );
1770            if set_max_score = 1 - current_match_index then
1771              --  Old match is seems better.
1772              main :=  matches (set_max_score).dl (index_max_score);
1773            else
1774              --  We prefer at least a literal, then a new, better match.
1775              Send_first_literal_of_match;
1776              return;
1777            end if;
1778          end if;
1779  
1780          if LZMA_friendly then
1781            limit := Integer'Max (main.length - 1, MATCH_LEN_MIN);
1782            for rep in rep_dist'Range loop
1783              if Compute_Match_Length (rep_dist (rep), limit) = limit then
1784                --  A "literal then DL_Code (some distance, main.length - 1)" match
1785                --  is verified and could use the stack of last distances -> got for it!
1786                Send_first_literal_of_match;
1787                return;
1788              end if;
1789            end loop;
1790          end if;
1791  
1792          pragma Assert (Is_match_correct (0));
1793          Skip (main.length - 2);
1794          --  Put_Line("[DL B]" & mainDist'Img & mainLen'Img);
1795          Send_DL_code (main.distance, main.length);
1796        end Get_Next_Symbol;
1797  
1798        procedure Deallocation is
1799        begin
1800          Dispose (buf);
1801          Dispose (tree);
1802          Dispose (Hash234.hash4Table);
1803        end Deallocation;
1804  
1805        actual_written, avail : Integer;
1806      begin
1807        --  NB: heap allocation used only for convenience because of
1808        --      the small default stack sizes on some compilers.
1809        buf := new Byte_Array (0 .. getBufSize);
1810        --
1811        actual_written := Fill_Window (String_buffer_size);
1812        if actual_written > 0 then
1813          loop
1814            Get_Next_Symbol;
1815            avail := Get_Available;
1816            if avail = 0 then
1817              actual_written := Fill_Window (String_buffer_size);
1818              exit when actual_written = 0;
1819            end if;
1820          end loop;
1821        end if;
1822        Deallocation;
1823      exception
1824        when others =>
1825          Deallocation;
1826          raise;
1827      end LZ77_using_BT4;
1828  
1829      procedure LZ77_by_Rich is
1830        --  * PROG2.C [lz77a.c]                                             *
1831        --  * Simple Hashing LZ77 Sliding Dictionary Compression Program    *
1832        --  * By Rich Geldreich, Jr. October, 1993                          *
1833        --  * Originally compiled with QuickC v2.5 in the small model.      *
1834        --  * This program uses more efficient code to delete strings from  *
1835        --  * the sliding dictionary compared to PROG1.C, at the expense of *
1836        --  * greater memory requirements. See the HashData and DeleteData  *
1837        --  * subroutines.                                                  *
1838        --
1839        --  Comments by GdM, 2019+ appear in square brackets: [...]
1840  
1841        --  Set this to True for a greedy encoder.
1842        GREEDY : constant Boolean := False;  --  [original: False]
1843  
1844        --  Ratio vs. speed constant [ Is it really a ratio? ].
1845        --  The larger this constant, the better the compression.
1846        MAXCOMPARES : constant := 4096;  --  [original: 75; good: 2400; from Info-Zip: 4096]
1847  
1848        --  Unused entry code.
1849        NIL : constant := 16#FFFF#;
1850  
1851        --  /* bits per symbol- normally 8 for general purpose compression */
1852        --  #define CHARBITS : constant := 8;  [ NB: dictionary uses char (byte) ]
1853  
1854        --  Minimum match length & maximum match length.
1855        THRESHOLD_Rich : constant := 2;
1856        MATCHBITS      : constant := 8;  --  [original: 4]
1857        --  [original: 2 ** MATCHBITS + THRESHOLD - 1]
1858        MAXMATCH  : constant := 2 ** MATCHBITS + THRESHOLD_Rich;  -- 258 is Deflate-friendly.
1859  
1860        --  Sliding dictionary size and hash table's size.
1861        --  Some combinations of HASHBITS and THRESHOLD values will not work
1862        --  correctly because of the way this program hashes strings.
1863  
1864        DICTBITS : constant := 15;  --  [original: 13]
1865        HASHBITS : constant := 13;  --  [original: 10]
1866        --
1867        DICTSIZE : constant := 2 ** DICTBITS;
1868        HASHSIZE : constant := 2 ** HASHBITS;
1869  
1870        --  # bits to shift after each XOR hash
1871        --  This constant must be high enough so that only THRESHOLD + 1
1872        --  characters are in the hash accumulator at one time.
1873  
1874        SHIFTBITS : constant := ((HASHBITS + THRESHOLD_Rich) / (THRESHOLD_Rich + 1));
1875  
1876        --  Sector size constants [the dictionary is partitoned in sectors].
1877  
1878        SECTORBIT : constant := 13;  --  [original: 10; OK: 13]
1879        SECTORLEN : constant := 2 ** SECTORBIT;
1880  
1881        HASH_MASK_1 : constant := 16#8000#;  --  [ was called HASHFLAG1 ]
1882        HASH_MASK_2 : constant := 16#7FFF#;  --  [ was called HASHFLAG2 ]
1883  
1884        --  Dictionary plus MAXMATCH extra chars for string comparisions.
1885        dict : array (Integer_M32'(0) .. DICTSIZE + MAXMATCH - 1) of Byte;
1886  
1887        subtype Unsigned_int is Unsigned_16;
1888  
1889        --  Hash table & link list tables.
1890  
1891        --  [ So far we index the hash table with Integer (minimum 16 bit signed) ]
1892        hash       : array (0 .. HASHSIZE - 1) of Unsigned_int := (others => NIL);
1893        --  [ nextlink: in lz77a.c: only through DICTSIZE - 1,
1894        --    although Init has: nextlink[DICTSIZE] = NIL. In doubt we set the
1895        --    'Last to DICTSIZE and fill everything with NIL... ]
1896        nextlink   : array (Integer_M32'(0) .. DICTSIZE)     of Unsigned_int := (others => NIL);
1897        lastlink   : array (Integer_M32'(0) .. DICTSIZE - 1) of Unsigned_int := (others => NIL);
1898  
1899        --  Loads dictionary with characters from the input stream.
1900        --
1901        procedure Load_Dict (dictpos : Integer_M32; actually_read : out Integer_M32) is
1902          i : Integer_M32 := 0;
1903        begin
1904          while More_Bytes loop
1905            dict (dictpos + i) := Read_Byte;
1906            i := i + 1;
1907            exit when i = SECTORLEN;
1908          end loop;
1909  
1910          --  Since the dictionary is a ring buffer, copy the characters at
1911          --  the very start of the dictionary to the end
1912          --  [this avoids having to use an "and" or a "mod" operator when searching].
1913          --
1914          if dictpos = 0 then
1915            for j in Integer_M32'(0) .. MAXMATCH - 1 loop
1916              dict (j + DICTSIZE) := dict (j);
1917            end loop;
1918          end if;
1919  
1920          actually_read := i;
1921        end Load_Dict;
1922  
1923        --  Deletes data from the dictionary search structures
1924        --  This is only done when the number of bytes to be
1925        --  compressed exceeds the dictionary's size.
1926        --
1927        procedure Delete_Data (dictpos : Integer_M32) is
1928          j, k : Integer_M32;
1929        begin
1930          --  Delete all references to the sector being deleted.
1931          k := dictpos + SECTORLEN;
1932          for i in dictpos .. k - 1 loop
1933            j := Integer_M32 (lastlink (i));
1934            if (Unsigned_int (j) and HASH_MASK_1) /= 0 then
1935              if j /= NIL then
1936                hash (Integer (Unsigned_int (j) and HASH_MASK_2)) := NIL;
1937              end if;
1938            else
1939              nextlink (j) := NIL;
1940            end if;
1941          end loop;
1942        end Delete_Data;
1943  
1944        --  Hash data just entered into dictionary.
1945        --  XOR hashing is used here, but practically any hash function will work.
1946        --
1947        procedure Hash_Data (dictpos, bytestodo : Integer_M32) is
1948          j : Integer;
1949          k : Integer_M32;
1950        begin
1951          if bytestodo <= THRESHOLD_Rich then  -- Not enough bytes in sector for match?
1952            nextlink (dictpos .. dictpos + bytestodo - 1) := (others => NIL);
1953            lastlink (dictpos .. dictpos + bytestodo - 1) := (others => NIL);
1954          else
1955            --  Matches can't cross sector boundaries.
1956            nextlink (dictpos + bytestodo - THRESHOLD_Rich .. dictpos + bytestodo - 1) := (others => NIL);
1957            lastlink (dictpos + bytestodo - THRESHOLD_Rich .. dictpos + bytestodo - 1) := (others => NIL);
1958  
1959            j :=  Integer (
1960                    Shift_Left (Unsigned_int (dict (dictpos)), SHIFTBITS)
1961                    xor
1962                    Unsigned_int (dict (dictpos + 1))
1963                  );
1964  
1965            k := dictpos + bytestodo - THRESHOLD_Rich;  --  Calculate end of sector.
1966  
1967            for i in dictpos ..  k - 1 loop
1968              j := Integer (
1969                    (Shift_Left (Unsigned_int (j), SHIFTBITS) and (HASHSIZE - 1))
1970                     xor
1971                     Unsigned_int (dict (i + THRESHOLD_Rich))
1972                   );
1973              lastlink (i) := Unsigned_int (j) or HASH_MASK_1;
1974              nextlink (i) := hash (j);
1975              if nextlink (i) /= NIL then
1976                lastlink (Integer_M32 (nextlink (i))) := Unsigned_int (i);
1977              end if;
1978              hash (j) := Unsigned_int (i);
1979            end loop;
1980          end if;
1981        end Hash_Data;
1982  
1983        matchlength, matchpos : Integer_M32;
1984  
1985        --  Finds match for string at position dictpos.
1986        --  This search code finds the longest AND closest
1987        --  match for the string at dictpos.
1988        --
1989        procedure Find_Match (dictpos, startlen : Integer_M32) is
1990          i, j : Integer_M32;
1991          match_byte : Byte;
1992        begin
1993          i := dictpos;
1994          matchlength := startlen;
1995          match_byte := dict (dictpos + matchlength);
1996          --
1997          Chances :
1998          for compare_count in 1 .. MAXCOMPARES loop
1999            i := Integer_M32 (nextlink (i));  --  Get next string in list.
2000            if i = NIL then
2001              return;
2002            end if;
2003            --
2004            if dict (i + matchlength) = match_byte then  --  Possible larger match?
2005              j := 0;
2006              --  Compare strings.
2007              loop
2008                exit when dict (dictpos + j) /= dict (i + j);
2009                j := j + 1;
2010                exit when j = MAXMATCH;
2011              end loop;
2012              --
2013              if j > matchlength then  --  Found larger match?
2014                matchlength := j;
2015                matchpos    := i;
2016                if matchlength = MAXMATCH then
2017                  return;  --  Exit if largest possible match.
2018                end if;
2019                match_byte := dict (dictpos + matchlength);
2020              end if;
2021            end if;
2022          end loop Chances;  --  Keep on trying until we run out of chances.
2023        end Find_Match;
2024  
2025        --  Finds dictionary matches for characters in current sector.
2026        --
2027        procedure Dict_Search (dictpos, bytestodo : Integer_M32) is
2028          i, j : Integer_M32;
2029          matchlen1, matchpos1 : Integer_M32;
2030          --
2031          procedure Write_literal_pos_i is
2032          pragma Inline (Write_literal_pos_i);
2033          begin
2034            Write_Literal (dict (i));
2035            i := i + 1;
2036            j := j - 1;
2037          end Write_literal_pos_i;
2038        begin
2039          i := dictpos;
2040          j := bytestodo;
2041  
2042          if not GREEDY then  --  Non-greedy search loop (slow).
2043  
2044            while j /= 0 loop  --  Loop while there are still characters left to be compressed.
2045              Find_Match (i, THRESHOLD_Rich);
2046  
2047              if matchlength > THRESHOLD_Rich then
2048                matchlen1 := matchlength;
2049                matchpos1 := matchpos;
2050  
2051                loop
2052                  Find_Match (i + 1, matchlen1);
2053  
2054                  if matchlength > matchlen1 then
2055                    matchlen1 := matchlength;
2056                    matchpos1 := matchpos;
2057                    Write_literal_pos_i;
2058                  else
2059                    if matchlen1 > j then
2060                      matchlen1 := j;
2061                      if matchlen1 <= THRESHOLD_Rich then
2062                        Write_literal_pos_i;
2063                        exit;
2064                      end if;
2065                    end if;
2066  
2067                    Write_DL_Code (
2068                      length   => Integer (matchlen1),
2069                      --  [The subtraction happens modulo 2**n, needs to be cleaned modulo 2**DICTSIZE]
2070                      distance => Integer ((Unsigned_32 (i) - Unsigned_32 (matchpos1)) and (DICTSIZE - 1))
2071                    );
2072                    i := i + matchlen1;
2073                    j := j - matchlen1;
2074                    exit;
2075                  end if;
2076                end loop;
2077  
2078              else
2079                Write_literal_pos_i;
2080              end if;
2081  
2082            end loop;  --  while j /= 0
2083  
2084          else  --  Greedy search loop (fast).
2085  
2086            while j /= 0 loop  --  Loop while there are still characters left to be compressed.
2087  
2088              Find_Match (i, THRESHOLD_Rich);
2089  
2090              if matchlength > j then
2091                matchlength := j;     --  Clamp matchlength.
2092              end if;
2093  
2094              if matchlength > THRESHOLD_Rich then  --  Valid match?
2095                Write_DL_Code (
2096                  length   => Integer (matchlength),
2097                  --  [The subtraction happens modulo 2**n, needs to be cleaned modulo 2**DICTSIZE]
2098                  distance => Integer ((Unsigned_32 (i) - Unsigned_32 (matchpos)) and (DICTSIZE - 1))
2099                );
2100                i := i + matchlength;
2101                j := j - matchlength;
2102              else
2103                Write_literal_pos_i;
2104              end if;
2105            end loop;
2106  
2107          end if;  --  Greedy or not.
2108  
2109        end Dict_Search;
2110  
2111        procedure Encode_Rich is
2112          dictpos, actual_read : Integer_M32 :=  0;
2113          deleteflag : Boolean := False;
2114        begin
2115          loop
2116            --  Delete old data from dictionary.
2117            if deleteflag then
2118              Delete_Data (dictpos);
2119            end if;
2120  
2121            --  Grab more data to compress.
2122            Load_Dict (dictpos, actual_read);
2123            exit when actual_read = 0;
2124  
2125            --  Hash the data.
2126            Hash_Data (dictpos, actual_read);
2127  
2128            --  Find dictionary matches.
2129            Dict_Search (dictpos, actual_read);
2130  
2131            dictpos := dictpos + SECTORLEN;
2132  
2133            --  Wrap back to beginning of dictionary when it's full.
2134            if dictpos = DICTSIZE then
2135              dictpos := 0;
2136              deleteflag := True;   --  Ok to delete now.
2137            end if;
2138          end loop;
2139        end Encode_Rich;
2140  
2141      begin
2142        Encode_Rich;
2143      end LZ77_by_Rich;
2144  
2145      --  The following is for research purposes: compare different LZ77
2146      --  algorithms applied to entropy encoders (Deflate, LZMA, ...).
2147  
2148      procedure LZ77_from_Dump_File is
2149        LZ77_Dump : Ada.Text_IO.File_Type;
2150        tag : String (1 .. 3);
2151        Wrong_LZ77_tag : exception;
2152        a, b : Integer;
2153        dummy : Byte;
2154        use Ada.Integer_Text_IO;
2155      begin
2156        --  Pretend we compress the given stream.
2157        --  Entire stream is consumed here.
2158        while More_Bytes loop
2159          dummy := Read_Byte;
2160        end loop;
2161        --  Now send dumped LZ77 data further.
2162        Ada.Text_IO.Open (LZ77_Dump, Ada.Text_IO.In_File, "dump.lz77");
2163        --  File from UnZip.Decompress, or LZMA.Decoding, some_trace = True mode
2164        while not Ada.Text_IO.End_Of_File (LZ77_Dump) loop
2165          Ada.Text_IO.Get (LZ77_Dump, tag);
2166          if tag = "Lit" then
2167            Get (LZ77_Dump, a);
2168            Write_Literal (Byte (a));
2169          elsif tag = "DLE" then
2170            Get (LZ77_Dump, a);
2171            Get (LZ77_Dump, b);
2172            Write_DL_Code (a, b);
2173          else
2174            raise Wrong_LZ77_tag;
2175          end if;
2176          Ada.Text_IO.Skip_Line (LZ77_Dump);
2177        end loop;
2178        Ada.Text_IO.Close (LZ77_Dump);
2179      end LZ77_from_Dump_File;
2180  
2181    begin
2182      case Method is
2183        when LZHuf =>
2184          LZ77_using_LZHuf;
2185        when IZ_4 .. IZ_10 =>
2186          LZ77_using_IZ (4 + Method_Type'Pos (Method) -  Method_Type'Pos (IZ_4));
2187        when BT4 =>
2188          LZ77_using_BT4;
2189        when Rich =>
2190          LZ77_by_Rich;
2191        when No_LZ77 =>
2192          while More_Bytes loop
2193            Write_Literal (Read_Byte);
2194          end loop;
2195        when Read_LZ77_Codes =>
2196          LZ77_from_Dump_File;
2197      end case;
2198    end Encode;
2199  
2200  end LZ77;

Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip). Ada programming.
Some news about Zip-Ada and other Ada projects on Gautier's blog.