Back to... Zip-Ada

Source file : lzma-encoding.adb



   1  --  LZMA.Encoding - a standalone, generic LZMA encoder.
   2  --  Author: G. de Montmollin (except parts mentioned below (*)).
   3  --
   4  --  This encoder was built mostly by mirroring from LZMA.Decoding upon
   5  --  the format's symmetries between encoding and decoding. For instance,
   6  --
   7  --      Bit_Tree_Decode(probs_len.low_coder(pos_state), Len_low_bits, len);
   8  --  becomes:
   9  --      Bit_Tree_Encode(probs_len.low_coder(pos_state), Len_low_bits, len);
  10  --
  11  --  Furthermore, cases for which there are alternatives are decided by comparing
  12  --  their respective probabilities (search "MProb" in the code).
  13  --
  14  --  (*) The base mechanism (the encoding of range, literals and DL codes)
  15  --      is from the original LzmaEnc.c by Igor Pavlov.
  16  --      The Get_dist_slot function is from the LZMAEncoder.java by Lasse Collin.
  17  
  18  --  Legal licensing note:
  19  
  20  --  Copyright (c) 2016 .. 2020 Gautier de Montmollin
  21  --  SWITZERLAND
  22  
  23  --  Permission is hereby granted, free of charge, to any person obtaining a copy
  24  --  of this software and associated documentation files (the "Software"), to deal
  25  --  in the Software without restriction, including without limitation the rights
  26  --  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  27  --  copies of the Software, and to permit persons to whom the Software is
  28  --  furnished to do so, subject to the following conditions:
  29  
  30  --  The above copyright notice and this permission notice shall be included in
  31  --  all copies or substantial portions of the Software.
  32  
  33  --  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  34  --  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  35  --  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  36  --  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  37  --  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  38  --  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  39  --  THE SOFTWARE.
  40  
  41  --  NB: this is the MIT License, as found on the site
  42  --  http://www.opensource.org/licenses/mit-license.php
  43  
  44  --
  45  --  Change log:
  46  --------------
  47  --
  48  --  18-Aug-2016: Fully functional.
  49  --  28-Jul-2016: Created.
  50  
  51  with LZ77;
  52  
  53  with Ada.Unchecked_Deallocation;
  54  
  55  package body LZMA.Encoding is
  56  
  57    use type Data_Bytes_Count;
  58  
  59    procedure Encode
  60      (level                  : Compression_Level           := Level_1;
  61       literal_context_bits   : Literal_Context_Bits_Range  := 3;   --  Bits of last byte are used.
  62       literal_position_bits  : Literal_Position_Bits_Range := 0;   --  Position mod 2**bits is used.
  63       position_bits          : Position_Bits_Range         := 2;   --  Position mod 2**bits is used.
  64       end_marker             : Boolean := True;   --  Produce an End-Of-Stream marker ?
  65       uncompressed_size_info : Boolean := False;  --  Optional extra header needed for .lzma files.
  66                                                   --  In LZMA.Decoding, type LZMA_Hints: has_size.
  67       dictionary_size        : Natural := Default_dictionary_size)  --  Not used by Level_1, Level_2.
  68    is
  69  
  70      --  Gets an integer [0, 63] matching the highest two bits of an integer.
  71      --  It is a log2 function with one "decimal".
  72      --
  73      function Get_dist_slot (dist : UInt32) return Unsigned is
  74        n : UInt32;
  75        i : Natural;
  76      begin
  77        if dist <= Start_dist_model_index then
  78          return Unsigned (dist);
  79        end if;
  80        n := dist;
  81        i := 31;
  82        if (n and 16#FFFF_0000#) = 0 then
  83          n := Shift_Left (n, 16);
  84          i := 15;
  85        end if;
  86        if (n and 16#FF00_0000#) = 0 then
  87          n := Shift_Left (n, 8);
  88          i := i - 8;
  89        end if;
  90        if (n and 16#F000_0000#) = 0 then
  91          n := Shift_Left (n, 4);
  92          i := i - 4;
  93        end if;
  94        if (n and 16#C000_0000#) = 0 then
  95          n := Shift_Left (n, 2);
  96          i := i - 2;
  97        end if;
  98        if (n and 16#8000_0000#) = 0 then
  99          i := i - 1;
 100        end if;
 101        return Unsigned (i * 2) + Unsigned (Shift_Right (dist, i - 1) and 1);
 102      end Get_dist_slot;
 103  
 104      --  Round to the next power of two. BT4 borks without this for the window size.
 105      function Ceiling_power_of_2 (x : Natural) return Positive is
 106        p : Positive := 1;
 107      begin
 108        while p < Integer'Last / 2 and p < x loop
 109          p := p * 2;
 110        end loop;
 111        return Integer'Max (p, x);
 112      end Ceiling_power_of_2;
 113  
 114      -----------------------------------
 115      --  LZ77 compression parameters  --
 116      -----------------------------------
 117  
 118      LZ77_choice : constant array (Compression_Level) of LZ77.Method_Type :=
 119        (Level_0   => LZ77.No_LZ77,  --  We don't do any LZ77 for level 0
 120         Level_1   => LZ77.IZ_6,
 121         Level_2   => LZ77.IZ_10,
 122         Level_3   => LZ77.BT4);
 123  
 124      Min_length : constant array (Compression_Level) of Positive :=
 125        (Level_1 | Level_2  => 3,     --  Deflate's minimum value
 126         others             => 2);    --  LZMA's minimum value
 127  
 128      Max_length : constant array (Compression_Level) of Positive :=
 129        (Level_1 | Level_2  => 258,   --  Deflate's maximum value
 130         others             => 273);  --  LZMA's maximum value
 131  
 132      extra_size : constant := 273 + 1 + LZ77.BT4_max_prefetch_positions;
 133      --  Extra space is used for DL codes scoring before being
 134      --  sent for real to the encoder.
 135  
 136      --  String_buffer_size: the actual dictionary size used.
 137      String_buffer_size : constant array (Compression_Level) of Positive :=
 138        (Level_0            => 16,       --  Fake: actually we don't use any LZ77 for level 0
 139         Level_1 | Level_2  => 2 ** 15,  --  Deflate's Value: 32 KiB
 140         Level_3            =>
 141           Integer'Max (
 142             Min_dictionary_size,                --  minimum:  4 KiB
 143             Integer'Min (
 144               --    dictionary_size is specified; default is 32 KiB
 145               Ceiling_power_of_2 (dictionary_size + extra_size),
 146               2 ** 28                          --  maximum: 256 MiB
 147             )
 148           )
 149        );
 150  
 151      -----------------------------------------------------------
 152      --  The LZMA "machine": here the LZ codes are processed  --
 153      --  and sent to the above bit encoder in a smart way.    --
 154      -----------------------------------------------------------
 155  
 156      type LZMA_Params_Info is record
 157        unpack_size          : Data_Bytes_Count := 0;
 158        --  unpack_size_defined is always False in this implementation:
 159        --  size is not known in advance and the header cannot be
 160        --  rewritten when processing is done.
 161        unpack_size_defined  : Boolean := False;
 162        header_has_size      : Boolean := uncompressed_size_info;
 163        has_end_mark         : Boolean := end_marker;
 164        dict_size            : UInt32  := UInt32 (String_buffer_size (level));
 165        lc                   : Literal_Context_Bits_Range  := literal_context_bits;
 166        lp                   : Literal_Position_Bits_Range := literal_position_bits;
 167        pb                   : Position_Bits_Range         := position_bits;
 168      end record;
 169  
 170      params : LZMA_Params_Info;
 171  
 172      --  Small stack of recent distances used for LZ. Required: initialized with zero values.
 173      --  lzma-specification.txt: "That set of 4 variables contains zero-based match
 174      --  distances and these variables are initialized with zero values"
 175      --
 176      subtype Repeat_stack_range is Integer range 0 .. 3;
 177      type Repeat_Stack is array (Repeat_stack_range) of UInt32;
 178      --
 179      probs : All_probabilities (last_lit_prob_index => 16#300# * 2 ** (params.lc + params.lp) - 1);
 180      pos_bits_mask    : constant UInt32 := 2 ** params.pb - 1;
 181      literal_pos_mask : constant UInt32 := 2 ** params.lp - 1;
 182  
 183      --  We expand the DL codes in order to have some past data.
 184      subtype Text_Buffer_Index is UInt32 range 0 .. UInt32 (String_buffer_size (level) - 1);
 185      type Text_Buffer is array (Text_Buffer_Index) of Byte;
 186      Text_Buf_Mask : constant UInt32 := UInt32 (String_buffer_size (level) - 1);
 187      --  NB: heap allocation (and then, the only pointer in this package) is used
 188      --      only for convenience because of small default stack sizes on some compilers.
 189      type p_Text_Buffer is access Text_Buffer;
 190      procedure Dispose is new Ada.Unchecked_Deallocation (Text_Buffer, p_Text_Buffer);
 191      Text_Buf : p_Text_Buffer := new Text_Buffer;
 192  
 193      function Idx_for_Literal_prob (position : Data_Bytes_Count; prev_byte : Byte) return Integer is
 194      pragma Inline (Idx_for_Literal_prob);
 195      begin
 196        return 16#300# *
 197            Integer (
 198              Shift_Left (UInt32 (position) and literal_pos_mask, params.lc) +
 199              Shift_Right (UInt32 (prev_byte), 8 - params.lc)
 200            );
 201      end Idx_for_Literal_prob;
 202  
 203      type Variants_Comparison_Choice is
 204        (
 205          None,      --  "Mechanical" encoding, straight from the LZ77 algorithm.
 206          Simple,    --  Compare simple alternative encodings and choose the most probable.
 207          Splitting  --  More advanced search for alternatives.
 208        );
 209  
 210      compare_variants : Variants_Comparison_Choice;
 211  
 212      type Machine_State is record
 213        state     : State_range;
 214        pos_state : Pos_state_range;
 215        prev_byte : Byte;
 216        R         : UInt32;
 217        total_pos : Data_Bytes_Count;
 218        rep_dist  : Repeat_Stack;
 219      end record;
 220  
 221      -------------------------
 222      --  Package Estimates  --
 223      -------------------------
 224      --
 225      --  Purpose: estimate probabilities of different alternative
 226      --  encodings, in order to choose the most probable encoding.
 227      --  Note that the LZMA encoder is already very efficient by
 228      --  taking the obvious choices. It is possible to ignore this
 229      --  package and its uses (see occurrences of "compare_variants").
 230      --
 231      --  In the following probability computations, we assume independent
 232      --  (multiplicative) probabilities, just like the range encoder does
 233      --  when adapting the range width. With higher probabilities, the width
 234      --  will decrease less and the compression will be better.
 235      --  Since the probability model is constantly adapting, we have kind of self-fulfilling
 236      --  predictions - e.g. if a Short Rep Match is chosen against a Literal, the context
 237      --  probabilities of the former will be increased instead of the latter.
 238  
 239      package Estimates is
 240        type MProb is digits 15 range 0.0 .. 1.0;
 241        --
 242        --  Literals
 243        --
 244        procedure Simulate_Literal_Byte (b : Byte; sim : in out Machine_State; prob : in out MProb);
 245        --
 246        function Test_Simple_Literal (
 247          b, b_match : Byte;
 248          prob       : CProb_array;
 249          sim        : Machine_State
 250        )
 251        return MProb;
 252        --
 253        function Test_Short_Rep_Match (sim : Machine_State) return MProb;
 254        --
 255        function Test_Literal_Byte (b : Byte; sim : Machine_State) return MProb;
 256        --
 257        --  Matches
 258        --
 259        function Test_Repeat_Match (
 260          index_rm : Repeat_stack_range;
 261          length   : Unsigned;
 262          sim      : Machine_State
 263        )
 264        return MProb;
 265        --
 266        function Test_Simple_Match (
 267          distance : UInt32;
 268          length   : Unsigned;
 269          sim      : Machine_State
 270        )
 271        return MProb;
 272  
 273        --  End of the obvious cases. Now things get tougher...
 274  
 275        --  Here we get the probability of a general DL code
 276        --  as Write_any_DL_code would generate it, including variants.
 277  
 278        procedure Simulate_any_DL_Code (
 279          distance        :        UInt32;
 280          length          :        Match_length_range;
 281          sim             : in out Machine_State;
 282          prob            : in out MProb;
 283          recursion_limit :        Natural
 284        );
 285  
 286        function Test_any_DL_Code (
 287          distance        : UInt32;
 288          length          : Match_length_range;
 289          sim             : Machine_State;
 290          recursion_limit : Natural
 291        )
 292        return MProb;
 293  
 294        --  Constants like 0.1234 appearing hereafter are empirical, tuned, magic numbers.
 295        --  To do: tune them with Machine Learning.
 296  
 297        --  Sometimes, a longer path like sending a simple match
 298        --  instead of a repeat match has a lower modelled probability.
 299        --  To avoid underusing repeat matches by letting their probabilities
 300        --  being adapted lower over time, we penalize the simple match alternative.
 301        Malus_simple_match_vs_rep : constant := 0.55;
 302  
 303        package DL_Code_Erosion is
 304          --  It is sometimes better to split a DL code as a very frequent literal,
 305          --  then a very frequent DL code with length-1.
 306          Lit_then_DL_threshold : constant := 0.875;
 307          --
 308          function Malus_lit_then_DL (distance : UInt32; length : Match_length_range) return MProb;
 309          pragma Inline (Malus_lit_then_DL);
 310          --  Case of DL code split into a shorter DL code, then a literal.
 311          function DL_code_then_Literal (
 312            distance        : UInt32;
 313            length          : Match_length_range;
 314            sim             : Machine_State;
 315            recursion_limit : Natural
 316          )
 317          return MProb;
 318        end DL_Code_Erosion;
 319  
 320        --  Here we define a generic DL code emission that is the same
 321        --  for simulation and actual writes. This way, we don't need to
 322        --  synchronize two pieces of Ada code doing the same operation,
 323        --  one in simulation and the other in real.
 324        --
 325        generic
 326          with procedure Simulated_or_actual_Literal_Byte (
 327            b    :        Byte;
 328            sim  : in out Machine_State;
 329            prob : in out MProb);
 330          --
 331          with procedure Simulated_or_actual_Strict_DL_Code (
 332            distance :        UInt32;
 333            length   :        Match_length_range;
 334            sim      : in out Machine_State;
 335            prob     : in out MProb
 336          );
 337          --
 338          I_am_a_simulation : Boolean;
 339        procedure Generic_any_DL_Code (
 340          distance        :        UInt32;
 341          length          :        Match_length_range;
 342          sim             : in out Machine_State;
 343          prob            : in out MProb;
 344          recursion_limit :        Natural
 345        );
 346  
 347      end Estimates;
 348  
 349      package body Estimates is
 350        To_Prob_Factor : constant  --  We compute the division (more expensive) at compile-time.
 351          MProb := 1.0 / MProb'Base (probability_model_count);
 352  
 353        function To_Math (cp : CProb) return MProb is
 354        pragma Inline (To_Math);
 355        begin
 356          return MProb'Base (cp) * To_Prob_Factor;
 357        end To_Math;
 358  
 359        function Test_Bit_Encoding (prob_bit : CProb; symbol : Unsigned) return MProb is
 360        pragma Inline (Test_Bit_Encoding);
 361          b : constant MProb'Base := MProb'Base (symbol);  --  b = 0.0 or 1.0
 362        begin
 363          return b + (1.0 - 2.0 * b) * To_Math (prob_bit);
 364          --  Branch-less equivalent of:
 365          --    if bit = 0 then
 366          --      return prob_bit;
 367          --    else
 368          --      return 1.0 - prob_bit;
 369          --    end if;
 370        end Test_Bit_Encoding;
 371  
 372        function Test_Simple_Literal (
 373          b, b_match    : Byte;
 374          prob          : CProb_array;
 375          sim           : Machine_State
 376        ) return MProb
 377        is
 378          prob_lit : MProb := Test_Bit_Encoding (probs.switch.match (sim.state, sim.pos_state), Literal_choice);
 379          symb : UInt32 := UInt32 (b) or 16#100#;
 380          --
 381          procedure Simulate_Literal is
 382          begin
 383            loop
 384              prob_lit := prob_lit *
 385                Test_Bit_Encoding (
 386                  prob_bit => prob (Integer (Shift_Right (symb, 8)) + prob'First),
 387                  symbol   => Unsigned (Shift_Right (symb, 7)) and 1
 388                );
 389              symb := Shift_Left (symb, 1);
 390              exit when symb >= 16#10000#;
 391            end loop;
 392          end Simulate_Literal;
 393          --
 394          procedure Simulate_Literal_Matched is
 395            offs  : UInt32 := 16#100#;
 396            match : UInt32 := UInt32 (b_match);
 397          begin
 398            loop
 399              match := Shift_Left (match, 1);
 400              prob_lit := prob_lit *
 401                Test_Bit_Encoding (
 402                  prob_bit => prob (Integer (offs + (match and offs) +
 403                                                   Shift_Right (symb, 8)) + prob'First),
 404                  symbol   => Unsigned (Shift_Right (symb, 7)) and 1
 405                );
 406              symb := Shift_Left (symb, 1);
 407              offs := offs and not (match xor symb);
 408              exit when symb >= 16#10000#;
 409            end loop;
 410          end Simulate_Literal_Matched;
 411          --
 412        begin
 413          if sim.state < 7 then
 414            Simulate_Literal;
 415          else
 416            Simulate_Literal_Matched;
 417          end if;
 418          return prob_lit;
 419        end Test_Simple_Literal;
 420  
 421        function Test_Short_Rep_Match (sim : Machine_State) return MProb is
 422        begin
 423          return
 424            Test_Bit_Encoding (probs.switch.match (sim.state, sim.pos_state), DL_code_choice) *
 425            Test_Bit_Encoding (probs.switch.rep (sim.state), Rep_match_choice) *
 426            Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_rep0_choice) *
 427            Test_Bit_Encoding (probs.switch.rep0_long (sim.state, sim.pos_state), The_length_is_1_choice);
 428        end Test_Short_Rep_Match;
 429  
 430        --  We simulate here LZ77_emits_literal_byte.
 431        procedure Simulate_Literal_Byte (b : Byte; sim : in out Machine_State; prob : in out MProb) is
 432          probs_lit_idx : constant Integer := Idx_for_Literal_prob (sim.total_pos, sim.prev_byte);
 433          ltr, srm : MProb;
 434          procedure Update_pos_related_stuff is
 435          begin
 436            sim.R := (sim.R + 1) and Text_Buf_Mask;
 437            sim.total_pos := sim.total_pos + 1;
 438            sim.pos_state := Pos_state_range (UInt32 (sim.total_pos) and pos_bits_mask);
 439            sim.prev_byte := b;
 440          end Update_pos_related_stuff;
 441          b_match : constant Byte := Text_Buf ((sim.R - sim.rep_dist (0) - 1) and Text_Buf_Mask);
 442        begin
 443          sim.pos_state := Pos_state_range (UInt32 (sim.total_pos) and pos_bits_mask);
 444          ltr := Test_Simple_Literal (b, b_match, probs.lit (probs_lit_idx .. probs.lit'Last), sim);
 445          if b = b_match and then sim.total_pos > Data_Bytes_Count (sim.rep_dist (0) + 1) then
 446            srm := Test_Short_Rep_Match (sim);
 447            if srm > ltr then
 448              --  Short Rep would be preferred.
 449              sim.state := Update_State_ShortRep (sim.state);
 450              prob := prob * srm;
 451              Update_pos_related_stuff;
 452              return;
 453            end if;
 454          end if;
 455          sim.state := Update_State_Literal (sim.state);
 456          prob := prob * ltr;
 457          Update_pos_related_stuff;
 458        end Simulate_Literal_Byte;
 459  
 460        function Test_Literal_Byte (b : Byte; sim : Machine_State) return MProb is
 461          --  The following variable is discarded after the simulation,
 462          --  since we only test the literal generation for getting its probability.
 463          sim_var : Machine_State := sim;
 464          prob : MProb := 1.0;
 465        begin
 466          Simulate_Literal_Byte (b, sim_var, prob);
 467          return prob;
 468        end Test_Literal_Byte;
 469  
 470        function Simulate_Bit_Tree (prob : CProb_array; num_bits : Positive; symbol : Unsigned) return MProb is
 471          res : MProb := 1.0;
 472          bit, m : Unsigned;
 473        begin
 474          m := 1;
 475          for i in reverse 0 .. num_bits - 1 loop
 476            bit := Unsigned (Shift_Right (UInt32 (symbol), i)) and 1;
 477            res := res * Test_Bit_Encoding (prob (Integer (m) + prob'First), bit);
 478            m := 2 * m + bit;
 479          end loop;
 480          return res;
 481        end Simulate_Bit_Tree;
 482  
 483        function Test_Length (
 484          probs_len     : Probs_for_LZ_Lengths;
 485          length        : Unsigned;
 486          sim_pos_state : Pos_state_range
 487        )
 488        return MProb
 489        is
 490          len : Unsigned := length - Min_match_length;
 491          res : MProb;
 492        begin
 493          if len < Len_low_symbols then
 494            res := Test_Bit_Encoding (probs_len.choice_1, 0) *
 495                   Simulate_Bit_Tree (probs_len.low_coder (sim_pos_state), Len_low_bits, len);
 496          else
 497            res := Test_Bit_Encoding (probs_len.choice_1, 1);
 498            len := len - Len_low_symbols;
 499            if len < Len_mid_symbols then
 500              res := res * Test_Bit_Encoding (probs_len.choice_2, 0)
 501                         * Simulate_Bit_Tree (probs_len.mid_coder (sim_pos_state), Len_mid_bits, len);
 502            else
 503              res := res * Test_Bit_Encoding (probs_len.choice_2, 1);
 504              len := len - Len_mid_symbols;
 505              res := res * Simulate_Bit_Tree (probs_len.high_coder, Len_high_bits, len);
 506            end if;
 507          end if;
 508          return res;
 509        end Test_Length;
 510  
 511        function Test_Repeat_Match (
 512          index_rm : Repeat_stack_range;
 513          length   : Unsigned;
 514          sim      : Machine_State
 515        )
 516        return MProb
 517        is
 518          res : MProb := Test_Bit_Encoding (probs.switch.rep (sim.state), Rep_match_choice);
 519        begin
 520          case index_rm is
 521            when 0 =>
 522              res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_rep0_choice)
 523                         * Test_Bit_Encoding
 524                             (probs.switch.rep0_long (sim.state, sim.pos_state), The_length_is_not_1_choice);
 525            when 1 =>
 526              res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_not_rep0_choice)
 527                         * Test_Bit_Encoding (probs.switch.rep_g1 (sim.state), The_distance_is_rep1_choice);
 528            when 2 =>
 529              res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_not_rep0_choice)
 530                         * Test_Bit_Encoding (probs.switch.rep_g1 (sim.state), The_distance_is_not_rep1_choice)
 531                         * Test_Bit_Encoding (probs.switch.rep_g2 (sim.state), The_distance_is_rep2_choice);
 532            when 3 =>
 533              res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_not_rep0_choice)
 534                         * Test_Bit_Encoding (probs.switch.rep_g1 (sim.state), The_distance_is_not_rep1_choice)
 535                         * Test_Bit_Encoding (probs.switch.rep_g2 (sim.state), The_distance_is_not_rep2_choice);
 536          end case;
 537          return res * Test_Length (probs.rep_len, length, sim.pos_state);
 538        end Test_Repeat_Match;
 539  
 540        function Test_Simple_Match (
 541          distance      : UInt32;
 542          length        : Unsigned;
 543          sim           : Machine_State
 544        )
 545        return MProb
 546        is
 547          --
 548          function Simulate_Bit_Tree_Reverse (prob : CProb_array; num_bits : Natural; symbol : UInt32)
 549          return MProb
 550          is
 551            res : MProb := 1.0;
 552            symb : UInt32 := symbol;
 553            m : Unsigned := 1;
 554            bit : Unsigned;
 555          begin
 556            for count_bits in reverse 1 .. num_bits loop
 557              bit := Unsigned (symb) and 1;
 558              res := res * Test_Bit_Encoding (prob (Integer (m) + prob'First), bit);
 559              m := 2 * m + bit;
 560              symb := Shift_Right (symb, 1);
 561            end loop;
 562            return res;
 563          end Simulate_Bit_Tree_Reverse;
 564          --
 565          function Test_Distance return MProb is
 566            len_state : constant Unsigned := Unsigned'Min (length - 2, len_to_pos_states - 1);
 567            dist_slot : constant Unsigned := Get_dist_slot (distance);
 568            base, dist_reduced : UInt32;
 569            footerBits : Natural;
 570            res : MProb;
 571          begin
 572            res := Simulate_Bit_Tree (probs.dist.slot_coder (len_state), Dist_slot_bits, dist_slot);
 573            if dist_slot >= Start_dist_model_index then
 574              footerBits := Natural (Shift_Right (UInt32 (dist_slot), 1)) - 1;
 575              base := Shift_Left (UInt32 (2 or (dist_slot and 1)), footerBits);
 576              dist_reduced := distance - base;
 577              if dist_slot < End_dist_model_index then
 578                res := res *
 579                  Simulate_Bit_Tree_Reverse (
 580                    probs.dist.pos_coder (Integer (base) - Integer (dist_slot) - 1 .. Pos_coder_range'Last),
 581                    footerBits,
 582                    dist_reduced
 583                  );
 584              else
 585                res := res *
 586                  (0.5 ** (footerBits - align_bits)) *  --  direct bits
 587                  Simulate_Bit_Tree_Reverse (
 588                    probs.dist.align_coder,
 589                    align_bits,
 590                    dist_reduced and align_mask
 591                  );
 592              end if;
 593            end if;
 594            return res;
 595          end Test_Distance;
 596        begin
 597          return
 598            Test_Bit_Encoding (probs.switch.rep (sim.state), Simple_match_choice) *
 599            Test_Length (probs.len, length, sim.pos_state) *
 600            Test_Distance;
 601        end Test_Simple_Match;
 602  
 603        --  We simulate here a Distance-Length code
 604        --  sent straight to the encoder (no variants).
 605        procedure Simulate_Strict_DL_Code (
 606          distance      :        UInt32;
 607          length        :        Match_length_range;
 608          sim           : in out Machine_State;
 609          prob          : in out MProb
 610        )
 611        is
 612        pragma Inline (Simulate_Strict_DL_Code);
 613          dist_ip : constant UInt32 := UInt32 (distance - 1);  --  7-Zip distance convention (minus 1)
 614          found_repeat : Integer := Repeat_Stack'First - 1;
 615          dlc : constant MProb := Test_Bit_Encoding (probs.switch.match (sim.state, sim.pos_state), DL_code_choice);
 616          sma : constant MProb := Test_Simple_Match (dist_ip, Unsigned (length), sim);
 617          rma : MProb;
 618          aux : UInt32;
 619          procedure Update_pos_related_stuff is
 620          begin
 621            sim.total_pos := sim.total_pos + Data_Bytes_Count (length);
 622            sim.pos_state := Pos_state_range (UInt32 (sim.total_pos) and pos_bits_mask);
 623            sim.R := (sim.R + UInt32 (length)) and Text_Buf_Mask;  --  This is mod String_buffer_size
 624            sim.prev_byte := Text_Buf ((sim.R - 1) and Text_Buf_Mask);
 625          end Update_pos_related_stuff;
 626        begin
 627          for i in Repeat_Stack'Range loop
 628            if dist_ip = sim.rep_dist (i) then
 629              found_repeat := i;
 630              exit;
 631              --  NB: it's possible to pick the most probable duplicate instead, but without clear gain
 632            end if;
 633          end loop;
 634          if found_repeat >= Repeat_Stack'First then
 635            rma := Test_Repeat_Match (found_repeat, Unsigned (length), sim);
 636            if rma >= sma * Malus_simple_match_vs_rep  then
 637              --  Repeat match case:
 638              prob := prob * dlc * rma;
 639              --  Roll the stack of recent distances up to the found item, which becomes the first one.
 640              aux := sim.rep_dist (found_repeat);
 641              for i in reverse 1 .. found_repeat loop
 642                sim.rep_dist (i) := sim.rep_dist (i - 1);
 643              end loop;
 644              sim.rep_dist (0) := aux;
 645              sim.state := Update_State_Rep (sim.state);
 646              Update_pos_related_stuff;
 647              return;
 648            end if;
 649          end if;
 650          --  Simple match case:
 651          prob := prob * dlc * sma;
 652          --  Shift the stack of recent distances; the new distance becomes the first item.
 653          for i in reverse 1 .. Repeat_stack_range'Last loop
 654            sim.rep_dist (i) := sim.rep_dist (i - 1);
 655          end loop;
 656          sim.rep_dist (0) := dist_ip;  --  0-based distance.
 657          sim.state := Update_State_Match (sim.state);
 658          Update_pos_related_stuff;
 659        end Simulate_Strict_DL_Code;
 660  
 661        function Test_Strict_DL_Code (
 662          distance      : UInt32;
 663          length        : Match_length_range;
 664          sim           : Machine_State
 665        )
 666        return MProb
 667        is
 668        pragma Inline (Test_Strict_DL_Code);
 669          --  The following variable is discarded after the simulation,
 670          --  since we only test strict DL code for getting its probability.
 671          sim_var : Machine_State := sim;
 672          --
 673          prob : MProb := 1.0;
 674        begin
 675          Simulate_Strict_DL_Code (distance, length, sim_var, prob);
 676          return prob;
 677        end Test_Strict_DL_Code;
 678  
 679        --  Expand fully a DL code as a string of literals.
 680        procedure Simulate_Expand_DL_code (
 681          distance      :        UInt32;
 682          length        :        Match_length_range;
 683          give_up       :        MProb;
 684          sim           : in out Machine_State;
 685          prob          : in out MProb
 686        )
 687        is
 688        pragma Inline (Simulate_Expand_DL_code);
 689          b : Byte;
 690          --
 691          sim_mem : constant Machine_State := sim;
 692          expanded_string_prob : MProb := 1.0;
 693          Copy_start : constant UInt32 := (sim.R - distance) and Text_Buf_Mask;
 694        begin
 695          for x in 1 .. length loop
 696            b := Text_Buf ((Copy_start + UInt32 (x - 1)) and Text_Buf_Mask);
 697            Simulate_Literal_Byte (b, sim, expanded_string_prob);
 698            --  Probability is decreasing over the loop, so it is
 699            --  useless to continue under given threshold.
 700            if expanded_string_prob < give_up then
 701              sim := sim_mem;
 702              exit;
 703            end if;
 704            sim.prev_byte := b;
 705          end loop;
 706          prob := prob * expanded_string_prob;
 707        end Simulate_Expand_DL_code;
 708  
 709        function Test_Expanded_DL_Code (
 710          distance      : UInt32;
 711          length        : Match_length_range;
 712          give_up       : MProb;
 713          sim           : Machine_State
 714        )
 715        return MProb
 716        is
 717          pragma Inline (Test_Expanded_DL_Code);
 718          --  The following variable is discarded after the simulation,
 719          --  since we only test the DL code expansion for getting its probability.
 720          sim_var : Machine_State := sim;
 721          --
 722          prob : MProb := 1.0;
 723        begin
 724          Simulate_Expand_DL_code (distance, length, give_up, sim_var, prob);
 725          return prob;
 726        end Test_Expanded_DL_Code;
 727  
 728        --  Case of a DL code split into two shorter DL codes.
 729        procedure Test_Split_DL (
 730          distance        :     UInt32;
 731          length          :     Match_length_range;
 732          sim             :     Machine_State;
 733          hurdle          :     MProb;
 734          recursion_limit :     Natural;
 735          best_prob       : out MProb;
 736          best_cut        : out Match_length_range
 737        );
 738        pragma Inline (Test_Split_DL);
 739  
 740        procedure Generic_any_DL_Code (
 741          distance        :        UInt32;
 742          length          :        Match_length_range;
 743          sim             : in out Machine_State;
 744          prob            : in out MProb;
 745          recursion_limit :        Natural
 746        )
 747        is
 748          Copy_start : constant UInt32 := (sim.R - distance) and Text_Buf_Mask;
 749          strict_dlc, expanded_dlc, strict_or_expanded_dlc, dlc_after_lit, head_lit : MProb;
 750          b_head : Byte;
 751          sim_post_lit_pos_state : Pos_state_range;
 752          best_prob : MProb;
 753          best_cut  : Match_length_range;
 754          new_recursion_limit : Integer;
 755        begin
 756          if I_am_a_simulation then
 757            new_recursion_limit := recursion_limit - 1;
 758          else
 759            new_recursion_limit := recursion_limit;  --  We do not limit in actual emission.
 760          end if;
 761          if new_recursion_limit < 0 then
 762            Simulated_or_actual_Strict_DL_Code (distance, length, sim, prob);
 763            return;
 764          end if;
 765          if compare_variants >= Simple then
 766            strict_dlc             := Test_Strict_DL_Code (distance, length, sim);
 767            expanded_dlc           := Test_Expanded_DL_Code (distance, length, strict_dlc, sim);
 768            strict_or_expanded_dlc := MProb'Max (strict_dlc, expanded_dlc);
 769            --
 770            if length > Min_match_length then
 771              b_head   := Text_Buf (Copy_start and Text_Buf_Mask);
 772              head_lit := Test_Literal_Byte (b_head, sim);
 773              --  One literal, then a shorter DL code, case #1:
 774              --  naive approach: we spot a super-probable literal.
 775              if head_lit >= DL_Code_Erosion.Lit_then_DL_threshold then
 776                Simulated_or_actual_Literal_Byte (b_head, sim, prob);
 777                Generic_any_DL_Code (distance, length - 1, sim, prob, new_recursion_limit);
 778                return;
 779              end if;
 780              --  One literal, then a shorter DL code, case #2:
 781              --  we estimate the shorter DL code's probability.
 782              sim_post_lit_pos_state := Pos_state_range (UInt32 (sim.total_pos + 1) and pos_bits_mask);
 783              dlc_after_lit :=
 784                Test_any_DL_Code (
 785                  distance, length - 1,
 786                  (Update_State_Literal (sim.state), sim_post_lit_pos_state, b_head,
 787                   (sim.R + 1) and Text_Buf_Mask, sim.total_pos + 1, sim.rep_dist),
 788                  new_recursion_limit
 789              );
 790              if head_lit * dlc_after_lit * DL_Code_Erosion.Malus_lit_then_DL (distance, length)
 791                > strict_or_expanded_dlc
 792              then
 793                Simulated_or_actual_Literal_Byte (b_head, sim, prob);
 794                Generic_any_DL_Code (distance, length - 1, sim, prob, new_recursion_limit);
 795                return;
 796              end if;
 797              if DL_Code_Erosion.DL_code_then_Literal (distance, length, sim, new_recursion_limit)
 798                > strict_or_expanded_dlc
 799              then
 800                --  We've got a better probability -> redo this variant
 801                --  (shorter DL code, then literal) for good.
 802                Generic_any_DL_Code (distance, length - 1, sim, prob, new_recursion_limit);
 803                Simulated_or_actual_Literal_Byte (Text_Buf ((sim.R - distance) and Text_Buf_Mask), sim, prob);
 804                return;
 805              end if;
 806            end if;
 807            --
 808            if expanded_dlc > strict_dlc then
 809              --  Here we prefer a full expansion of DL code as literals.
 810              for x in 1 .. length loop
 811                Simulated_or_actual_Literal_Byte (
 812                  Text_Buf ((Copy_start + UInt32 (x - 1)) and Text_Buf_Mask), sim, prob
 813                );
 814              end loop;
 815              return;
 816            end if;
 817          end if;
 818          if compare_variants >= Splitting then
 819            Test_Split_DL (
 820              distance, length,
 821              sim, strict_or_expanded_dlc, new_recursion_limit,
 822              best_prob, best_cut
 823            );
 824            if best_prob > strict_or_expanded_dlc then
 825              Generic_any_DL_Code (distance, best_cut,          sim, prob, new_recursion_limit);
 826              Generic_any_DL_Code (distance, length - best_cut, sim, prob, new_recursion_limit);
 827              return;
 828            end if;
 829          end if;
 830          --  At this point, we go for simulating or writing the plain DL code.
 831          Simulated_or_actual_Strict_DL_Code (distance, length, sim, prob);
 832        end Generic_any_DL_Code;
 833  
 834        --  We simulate here Write_any_DL_code, including the variants!
 835        procedure Simulate_any_DL_Code_Instance is new Generic_any_DL_Code
 836          (Simulated_or_actual_Literal_Byte   => Simulate_Literal_Byte,
 837           Simulated_or_actual_Strict_DL_Code => Simulate_Strict_DL_Code,
 838           I_am_a_simulation                  => True);
 839  
 840        procedure Simulate_any_DL_Code (
 841          distance        :        UInt32;
 842          length          :        Match_length_range;
 843          sim             : in out Machine_State;
 844          prob            : in out MProb;
 845          recursion_limit :        Natural
 846        )
 847        renames Simulate_any_DL_Code_Instance;
 848  
 849        function Test_any_DL_Code (
 850          distance        : UInt32;
 851          length          : Match_length_range;
 852          sim             : Machine_State;
 853          recursion_limit : Natural
 854        )
 855        return MProb
 856        is
 857          --  The following variable is discarded after the simulation,
 858          --  since we only test the DL code for getting its probability.
 859          sim_var : Machine_State := sim;
 860          --
 861          prob : MProb := 1.0;
 862        begin
 863          Simulate_any_DL_Code (distance, length, sim_var, prob, recursion_limit);
 864          return prob;
 865        end Test_any_DL_Code;
 866  
 867        package body DL_Code_Erosion is
 868          --
 869          function DL_code_then_Literal (
 870            distance        : UInt32;
 871            length          : Match_length_range;
 872            sim             : Machine_State;
 873            recursion_limit : Natural
 874          )
 875          return MProb
 876          is
 877            --  The following variable is discarded after the simulation,
 878            --  since we only test this variant for getting its probability.
 879            sim_var : Machine_State := sim;
 880            --  This "DL erosion" technique empirically works better for shorter distances and lengths.
 881            Malus_DL_then_lit : constant MProb :=
 882              MProb'Max (0.0, 0.135 - MProb'Base (distance) * 1.0e-8 - MProb'Base (length) * 1.0e-4);
 883            --
 884            prob : MProb := Malus_DL_then_lit;
 885          begin
 886            Simulate_any_DL_Code (distance, length - 1, sim_var, prob, recursion_limit);
 887            Simulate_Literal_Byte (Text_Buf ((sim_var.R - distance) and Text_Buf_Mask), sim_var, prob);
 888            return prob;
 889          end DL_code_then_Literal;
 890          --
 891          function Malus_lit_then_DL (distance : UInt32; length : Match_length_range) return MProb is
 892          begin
 893            --  This "DL erosion" technique empirically works better for shorter distances and lengths.
 894            return MProb'Max (0.0, 0.064 - MProb'Base (distance) * 1.0e-9 - MProb'Base (length) * 3.0e-5);
 895          end Malus_lit_then_DL;
 896          --
 897        end DL_Code_Erosion;
 898  
 899        subtype Splits_considered is Match_length_range range 4 .. 9;
 900  
 901        procedure Test_Split_DL (
 902          distance        :     UInt32;
 903          length          :     Match_length_range;
 904          sim             :     Machine_State;
 905          hurdle          :     MProb;
 906          recursion_limit :     Natural;
 907          best_prob       : out MProb;
 908          best_cut        : out Match_length_range
 909        )
 910        is
 911          sim_var : Machine_State := sim;
 912          --  For long distances, the DL split technique degrades compression and makes
 913          --  the compression time explode.
 914          Malus : constant MProb :=
 915            MProb'Max (0.0, 0.27 - MProb'Base (distance) * 2.0e-6);
 916          prob : MProb;
 917          lowered_recursion_limit : constant Natural := Integer'Max (0, recursion_limit - 1);
 918        begin
 919          best_prob := 0.0;
 920          best_cut  := Match_length_range'First;
 921          if Malus < hurdle then
 922            return;
 923          end if;
 924          for cut in 2 .. length - 2 loop  --  If length < 4 this loop is skipped.
 925            if cut in Splits_considered or else length - cut in Splits_considered then
 926              --  If we test all lengths the compression becomes too slow
 927              --  (huge number of combinations since recursion is involved).
 928              prob := Malus;
 929              sim_var := sim;  --  Set or reset simulation state.
 930              Simulate_any_DL_Code (distance, cut, sim_var, prob, lowered_recursion_limit);
 931              if prob <= hurdle then
 932                null;
 933                --  Give up this iteration, since the probability is already below the required
 934                --  level -> would be even lower after simulating the second DL code.
 935              else
 936                Simulate_any_DL_Code (distance, length - cut, sim_var, prob, lowered_recursion_limit);
 937                if prob > best_prob then
 938                  best_prob := prob;
 939                  best_cut := cut;
 940                end if;
 941              end if;
 942            end if;
 943          end loop;
 944        end Test_Split_DL;
 945  
 946      end Estimates;
 947  
 948      -------------------------------------
 949      --  Range encoding of single bits. --
 950      -------------------------------------
 951  
 952      type Range_Encoder is record
 953        width      : UInt32  := 16#FFFF_FFFF#;  --  (*)
 954        low        : UInt64  := 0;  --  The current range is [low, low+width[
 955        cache      : Byte    := 0;
 956        cache_size : UInt64  := 1;
 957      end record;
 958      --  (*) "width" is called "range" in LZMA spec and "remaining width" in G.N.N. Martin's
 959      --      article about range encoding.
 960  
 961      range_enc : Range_Encoder;
 962      encoded_uncompressed_bytes : UInt64 := 0;
 963  
 964      procedure Shift_low is
 965        --  Top 32 bits of the lower range bound.
 966        lb_top32    : constant UInt64 := Shift_Right (range_enc.low, 32);
 967        --  Bottom 32 bits of the lower range bound.
 968        lb_bottom32 : constant UInt32 := UInt32 (range_enc.low and 16#FFFF_FFFF#);
 969        temp, lb_bits_33_40 : Byte;
 970      begin
 971        if lb_bottom32 < 16#FF00_0000# or else lb_top32 /= 0 then
 972          --  Flush range_enc.cache_size bytes, based on only
 973          --  2 byte values: range_enc.cache and lb_bits_33_40.
 974          --  The mechanism is a bit obscure (seems to be a carry)...
 975          temp := range_enc.cache;
 976          lb_bits_33_40 := Byte (lb_top32 and 16#FF#);
 977          loop
 978            Write_Byte (temp + lb_bits_33_40);  --  Finally a byte is output sometimes!
 979            temp := 16#FF#;
 980            range_enc.cache_size := range_enc.cache_size - 1;
 981            exit when range_enc.cache_size = 0;
 982          end loop;
 983          range_enc.cache := Byte (Shift_Right (lb_bottom32, 24) and 16#FF#);  --  bits 25 to 32
 984        end if;
 985        range_enc.cache_size := range_enc.cache_size + 1;
 986        --  Bits 25 to 32 are erased and the trailing zeroes are added.
 987        range_enc.low := UInt64 (Shift_Left (lb_bottom32, 8));
 988      end Shift_low;
 989  
 990      procedure Flush_range_encoder is
 991      begin
 992        for i in 1 .. 5 loop
 993          Shift_low;
 994        end loop;
 995      end Flush_range_encoder;
 996  
 997      --  Normalize corresponds to G.N.N. Martin's revised algorithm's adding
 998      --  of trailing digits (zeroes). The leftmost digits of the range don't
 999      --  change anymore and can be output.
1000      --
1001      procedure Normalize is
1002      pragma Inline (Normalize);
1003      begin
1004        if range_enc.width < width_threshold then
1005          range_enc.width := Shift_Left (range_enc.width, 8);  --  Trailing zeroes are added to width.
1006          Shift_low;
1007        end if;
1008      end Normalize;
1009  
1010      procedure Encode_Bit (prob : in out CProb; symbol : in Unsigned) is
1011      pragma Inline (Encode_Bit);
1012        cur_prob : constant CProb := prob;  --  Local copy
1013        --  The current interval is [low, high=low+width[ .
1014        --  The bound is between 0 and width, closer to 0 if prob
1015        --  is small, closer to width if prob is large.
1016        bound : constant UInt32 := Shift_Right (range_enc.width, probability_model_bits) * UInt32 (cur_prob);
1017      begin
1018        if symbol = 0 then
1019          --  Left sub-interval, for symbol 0: [low, low+bound[ .
1020          --  Set new range. low is unchanged, high is new.
1021          range_enc.width := bound;
1022          Normalize;
1023          --  Increase probability.
1024          --  The truncation ensures that prob <= Probability_model_count - (2**m - 1). See note (*).
1025          prob := cur_prob + Shift_Right (probability_model_count - cur_prob, probability_change_bits);
1026        else
1027          --  Right sub-interval, for symbol 1: [low+bound, high=low+width[ .
1028          --  Set new range. low is new, high is unchanged.
1029          range_enc.low := range_enc.low + UInt64 (bound);
1030          range_enc.width := range_enc.width - bound;
1031          Normalize;
1032          --  Decrease probability: prob:= prob - {prob / 2**m}, approx. equal to prob * (1 - 2**m).
1033          --  The truncation represented by {} ensures that prob >= 2**m - 1. See note (*).
1034          prob := cur_prob - Shift_Right (cur_prob, probability_change_bits);
1035        end if;
1036        --  (*) It can be checked exhaustively that it is always the case.
1037        --      A too low prob could cause the width to be too small or even zero.
1038        --      Same for "too high". See LZMA sheet in za_work.xls.
1039      end Encode_Bit;
1040  
1041      -----------------------------------------------------------------------------------
1042      --  This part processes the case where LZ77 sends a literal (a plain text byte)  --
1043      -----------------------------------------------------------------------------------
1044  
1045      procedure Write_Literal (prob : in out CProb_array; symbol : in UInt32) is
1046      pragma Inline (Write_Literal);
1047        symb : UInt32 := symbol or 16#100#;
1048      begin
1049        loop
1050          Encode_Bit ( --  Prob. offset is always 1, 2, 4, 8, .. , 128
1051            prob   => prob (Integer (Shift_Right (symb, 8)) + prob'First),
1052            symbol => Unsigned (Shift_Right (symb, 7)) and 1
1053          );
1054          symb := Shift_Left (symb, 1);
1055          exit when symb >= 16#10000#;
1056        end loop;
1057      end Write_Literal;
1058  
1059      procedure Write_Literal_Matched (prob : in out CProb_array; symbol, matched : in UInt32) is
1060      pragma Inline (Write_Literal_Matched);
1061        symb  : UInt32 := symbol or 16#100#;
1062        offs  : UInt32 := 16#100#;
1063        match : UInt32 := matched;
1064      begin
1065        loop
1066          match := Shift_Left (match, 1);
1067          Encode_Bit (
1068            prob   => prob (Integer (offs + (match and offs) + Shift_Right (symb, 8)) + prob'First),
1069            symbol => Unsigned (Shift_Right (symb, 7)) and 1
1070          );
1071          symb := Shift_Left (symb, 1);
1072          offs := offs and not (match xor symb);
1073          exit when symb >= 16#10000#;
1074        end loop;
1075      end Write_Literal_Matched;
1076  
1077      use type Estimates.MProb;
1078  
1079      --  Encoder State: state of the real LZMA encoder - data is written here, no simulation!
1080      ES : Machine_State :=
1081        (R          => 0,
1082         prev_byte  => 0,
1083         total_pos  => 0,
1084         rep_dist   => (others => 0),
1085         state      => 0,
1086         pos_state  => 0
1087        );
1088  
1089      max_recursion : constant := 2;
1090  
1091      procedure Update_pos_state is
1092      pragma Inline (Update_pos_state);
1093      begin
1094        ES.pos_state := Pos_state_range (UInt32 (ES.total_pos) and pos_bits_mask);
1095      end Update_pos_state;
1096  
1097      procedure LZ77_emits_literal_byte (b : Byte) is
1098        pb_lit_idx : constant Integer := Idx_for_Literal_prob (ES.total_pos, ES.prev_byte);
1099        b_match : constant Byte := Text_Buf ((ES.R - ES.rep_dist (0) - 1) and Text_Buf_Mask);
1100      begin
1101        if b = b_match and then ES.total_pos > Data_Bytes_Count (ES.rep_dist (0) + 1)
1102          and then
1103            (compare_variants = None
1104               or else
1105             Estimates.Test_Short_Rep_Match (ES) >
1106             Estimates.Test_Simple_Literal (b, b_match, probs.lit (pb_lit_idx .. probs.lit'Last), ES))
1107        then
1108          --  We are lucky: both bytes are the same. No literal to encode, "Short Rep Match"
1109          --  case, and its cost (4 bits) is more affordable than the literal's cost.
1110          Encode_Bit (probs.switch.match (ES.state, ES.pos_state), DL_code_choice);
1111          Encode_Bit (probs.switch.rep (ES.state), Rep_match_choice);
1112          Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_rep0_choice);
1113          Encode_Bit (probs.switch.rep0_long (ES.state, ES.pos_state), The_length_is_1_choice);
1114          ES.state := Update_State_ShortRep (ES.state);
1115        else
1116          Encode_Bit (probs.switch.match (ES.state, ES.pos_state), Literal_choice);
1117          if ES.state < 7 then
1118            Write_Literal (probs.lit (pb_lit_idx .. probs.lit'Last), UInt32 (b));
1119          else
1120            Write_Literal_Matched (probs.lit (pb_lit_idx .. probs.lit'Last), UInt32 (b), UInt32 (b_match));
1121          end if;
1122          ES.state := Update_State_Literal (ES.state);
1123        end if;
1124        ES.total_pos := ES.total_pos + 1;
1125        Update_pos_state;
1126        ES.prev_byte := b;
1127        Text_Buf (ES.R) := b;
1128        ES.R := (ES.R + 1) and Text_Buf_Mask;  --  This is mod String_buffer_size
1129        encoded_uncompressed_bytes := encoded_uncompressed_bytes + 1;
1130      end LZ77_emits_literal_byte;
1131  
1132      procedure Write_Literal_Byte (
1133        b          :        Byte;
1134        dummy_sim  : in out Machine_State;
1135        dummy_prob : in out Estimates.MProb)
1136      is
1137      begin
1138        LZ77_emits_literal_byte (b);
1139      end Write_Literal_Byte;
1140  
1141      ---------------------------------------------------------------------------------
1142      --  This part processes the case where LZ77 sends a Distance-Length (DL) code  --
1143      ---------------------------------------------------------------------------------
1144  
1145      procedure Bit_Tree_Encode (
1146        prob     : in out CProb_array;
1147        num_bits :        Positive;
1148        symbol   :        Unsigned)
1149      is
1150        bit, m : Unsigned;
1151      begin
1152        m := 1;
1153        for i in reverse 0 .. num_bits - 1 loop
1154          bit := Unsigned (Shift_Right (UInt32 (symbol), i)) and 1;
1155          Encode_Bit (prob (Integer (m) + prob'First), bit);
1156          m := 2 * m + bit;
1157        end loop;
1158      end Bit_Tree_Encode;
1159  
1160      procedure Encode_Length (probs_len : in out Probs_for_LZ_Lengths; length : Unsigned) is
1161        len : Unsigned := length - Min_match_length;
1162      begin
1163        if len < Len_low_symbols then
1164          Encode_Bit (probs_len.choice_1, 0);
1165          --  LZ length in [2..9], i.e. len in [0..7]
1166          Bit_Tree_Encode (probs_len.low_coder (ES.pos_state), Len_low_bits, len);
1167        else
1168          Encode_Bit (probs_len.choice_1, 1);
1169          len := len - Len_low_symbols;
1170          if len < Len_mid_symbols then
1171            Encode_Bit (probs_len.choice_2, 0);
1172            --  LZ length in [10..17], i.e. len in [0..7]
1173            Bit_Tree_Encode (probs_len.mid_coder (ES.pos_state), Len_mid_bits, len);
1174          else
1175            Encode_Bit (probs_len.choice_2, 1);
1176            len := len - Len_mid_symbols;
1177            --  LZ length in [18..273], i.e. len in [0..255]
1178            Bit_Tree_Encode (probs_len.high_coder, Len_high_bits, len);
1179          end if;
1180        end if;
1181      end Encode_Length;
1182  
1183      procedure Write_Simple_Match (dist_ip : UInt32; length : Unsigned) is
1184        --
1185        procedure Bit_Tree_Reverse_Encode (
1186          prob     : in out CProb_array;
1187          num_bits : in     Natural;
1188          symbol   : in     UInt32
1189        )
1190        is
1191          symb : UInt32 := symbol;
1192          m : Unsigned := 1;
1193          bit : Unsigned;
1194        begin
1195          for count_bits in reverse 1 .. num_bits loop
1196            bit := Unsigned (symb) and 1;
1197            Encode_Bit (prob (Integer (m) + prob'First), bit);
1198            m := 2 * m + bit;
1199            symb := Shift_Right (symb, 1);
1200          end loop;
1201        end Bit_Tree_Reverse_Encode;
1202  
1203        --  Range encoding of num_bits with equiprobability.
1204        --
1205        procedure Encode_Direct_Bits (value : UInt32; num_bits : Natural) is
1206        begin
1207          for i in reverse 0 .. num_bits - 1 loop
1208            --  Bound is the half width. New width is halved anyway.
1209            range_enc.width := Shift_Right (range_enc.width, 1);
1210            --  Either low is unchanged (bit=0), or new low := old low + bound (bit=1).
1211            range_enc.low := range_enc.low +
1212              (UInt64 (range_enc.width) and (0 - UInt64 (Shift_Right (value, i) and 1)));
1213            Normalize;
1214          end loop;
1215        end Encode_Direct_Bits;
1216        --
1217        procedure Encode_Distance is
1218          len_state : constant Unsigned := Unsigned'Min (length - 2, len_to_pos_states - 1);
1219          dist_slot : constant Unsigned := Get_dist_slot (dist_ip);
1220          base, dist_reduced : UInt32;
1221          footerBits : Natural;
1222        begin
1223          Bit_Tree_Encode (probs.dist.slot_coder (len_state), Dist_slot_bits, dist_slot);
1224          if dist_slot >= Start_dist_model_index then
1225            footerBits := Natural (Shift_Right (UInt32 (dist_slot), 1)) - 1;
1226            base := Shift_Left (UInt32 (2 or (dist_slot and 1)), footerBits);
1227            dist_reduced := dist_ip - base;
1228            if dist_slot < End_dist_model_index then
1229              Bit_Tree_Reverse_Encode (
1230                probs.dist.pos_coder (Integer (base) - Integer (dist_slot) - 1 .. Pos_coder_range'Last),
1231                footerBits,
1232                dist_reduced
1233              );
1234            else
1235              Encode_Direct_Bits (Shift_Right (dist_reduced, align_bits), footerBits - align_bits);
1236              Bit_Tree_Reverse_Encode (
1237                probs.dist.align_coder,
1238                align_bits,
1239                dist_reduced and align_mask
1240              );
1241            end if;
1242          end if;
1243        end Encode_Distance;
1244        --
1245      begin
1246        Encode_Bit (probs.switch.rep (ES.state), Simple_match_choice);
1247        ES.state := Update_State_Match (ES.state);
1248        Encode_Length (probs.len, length);
1249        Encode_Distance;
1250        --  Shift the stack of recent distances; the new distance becomes the first item.
1251        for i in reverse 1 .. Repeat_stack_range'Last loop
1252          ES.rep_dist (i) := ES.rep_dist (i - 1);
1253        end loop;
1254        ES.rep_dist (0) := dist_ip;
1255      end Write_Simple_Match;
1256  
1257      procedure Write_Repeat_Match (index_rm : Repeat_stack_range; length : Unsigned) is
1258        aux : UInt32;
1259      begin
1260        Encode_Bit (probs.switch.rep (ES.state), Rep_match_choice);
1261        case index_rm is
1262          when 0 =>
1263            Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_rep0_choice);
1264            Encode_Bit (probs.switch.rep0_long (ES.state, ES.pos_state), The_length_is_not_1_choice);
1265          when 1 =>
1266            Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_not_rep0_choice);
1267            Encode_Bit (probs.switch.rep_g1 (ES.state), The_distance_is_rep1_choice);
1268          when 2 =>
1269            Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_not_rep0_choice);
1270            Encode_Bit (probs.switch.rep_g1 (ES.state), The_distance_is_not_rep1_choice);
1271            Encode_Bit (probs.switch.rep_g2 (ES.state), The_distance_is_rep2_choice);
1272          when 3 =>
1273            Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_not_rep0_choice);
1274            Encode_Bit (probs.switch.rep_g1 (ES.state), The_distance_is_not_rep1_choice);
1275            Encode_Bit (probs.switch.rep_g2 (ES.state), The_distance_is_not_rep2_choice);
1276        end case;
1277        --  Roll the stack of recent distances up to the found item, which becomes the first one.
1278        aux := ES.rep_dist (index_rm);
1279        for i in reverse 1 .. index_rm loop
1280          ES.rep_dist (i) := ES.rep_dist (i - 1);
1281        end loop;
1282        ES.rep_dist (0) := aux;
1283        --
1284        Encode_Length (probs.rep_len, length);
1285        ES.state := Update_State_Rep (ES.state);
1286      end Write_Repeat_Match;
1287  
1288      procedure Write_Strict_DL_Code (
1289        distance   :        UInt32;
1290        length     :        Match_length_range;
1291        dummy_sim  : in out Machine_State;
1292        dummy_prob : in out Estimates.MProb
1293      )
1294      is
1295        dist_ip : constant UInt32 := UInt32 (distance - 1);  --  7-Zip distance convention (minus 1)
1296        found_repeat : Integer := Repeat_Stack'First - 1;
1297      begin
1298        pragma Assert (
1299          UInt64 (distance) <= encoded_uncompressed_bytes,
1300          "distance goes before input stream's begin"
1301        );
1302        Encode_Bit (probs.switch.match (ES.state, ES.pos_state), DL_code_choice);
1303        for i in Repeat_Stack'Range loop
1304          if dist_ip = ES.rep_dist (i) then
1305            found_repeat := i;
1306            exit;
1307            --  NB: it's possible to pick the most probable duplicate instead, but without clear gain
1308          end if;
1309        end loop;
1310        if found_repeat >= Repeat_Stack'First
1311          and then
1312            (compare_variants = None
1313               or else
1314             Estimates.Test_Repeat_Match (found_repeat, Unsigned (length), ES)
1315             >=
1316             Estimates.Test_Simple_Match (dist_ip, Unsigned (length), ES) *
1317             Estimates.Malus_simple_match_vs_rep
1318            )
1319        then
1320          Write_Repeat_Match (found_repeat, Unsigned (length));
1321        else
1322          Write_Simple_Match (dist_ip, Unsigned (length));
1323        end if;
1324        ES.total_pos := ES.total_pos + Data_Bytes_Count (length);
1325        Update_pos_state;
1326        ES.R := ES.R + UInt32 (length) and Text_Buf_Mask;  --  This is mod String_buffer_size
1327        ES.prev_byte := Text_Buf ((ES.R - 1) and Text_Buf_Mask);
1328      end Write_Strict_DL_Code;
1329  
1330      --  All the smart things to optimize the probability model by breaking
1331      --  DL codes is done in the following procedure:
1332      --
1333      procedure Write_any_DL_code is new Estimates.Generic_any_DL_Code
1334        (Simulated_or_actual_Literal_Byte   => Write_Literal_Byte,
1335         Simulated_or_actual_Strict_DL_Code => Write_Strict_DL_Code,
1336         I_am_a_simulation                  => False);
1337  
1338      procedure Expand_DL_Code_to_Buffer (
1339        sim      : Machine_State;
1340        distance : Integer;
1341        length   : Match_length_range
1342      )
1343      is
1344        Rx : UInt32 := sim.R;
1345        Copy_start : constant UInt32 := (sim.R - UInt32 (distance)) and Text_Buf_Mask;
1346      begin
1347        --  Expand early into the circular "text" buffer to have it up to date
1348        --  and available to simulations.
1349        for K in 0 .. UInt32 (length - 1) loop
1350          Text_Buf (Rx) := Text_Buf ((Copy_start + K) and Text_Buf_Mask);
1351          Rx := (Rx + 1) and Text_Buf_Mask;  --  This is mod String_buffer_size
1352        end loop;
1353      end Expand_DL_Code_to_Buffer;
1354  
1355      procedure LZ77_emits_DL_code (distance : Integer; length : Match_length_range) is
1356        dummy_prob : Estimates.MProb := 0.0;
1357      begin
1358        Expand_DL_Code_to_Buffer (ES, distance, length);
1359        encoded_uncompressed_bytes := encoded_uncompressed_bytes + UInt64 (length);
1360        Write_any_DL_code (UInt32 (distance), length, ES, dummy_prob, max_recursion);
1361      end LZ77_emits_DL_code;
1362  
1363      procedure Estimate_DL_Codes_for_LZ77 (
1364        matches          : in out LZ77.Matches_Array;
1365        old_match_index  : in     Natural;
1366        prefixes         : in     LZ77.Byte_Array;
1367        best_score_index :    out Positive;
1368        best_score_set   :    out LZ77.Prefetch_Index_Type;
1369        match_trace      :    out LZ77.DLP_Array
1370      )
1371      is
1372      pragma Unreferenced (match_trace);
1373        use Estimates;
1374        last_pos_any_DL : Natural := 0;
1375        sim_new : Machine_State := ES;
1376        offset_new_match_set : constant array (Boolean) of Natural := (0, 1);
1377        head_lit_prob : MProb;
1378        --
1379        --  Compare different ways of sending DL codes starting with
1380        --  DL (i), with a total length 'last_pos_any_DL', in order to
1381        --  compare sequences of the same length starting with the different
1382        --  matches in DL_old and DL_new. The matches in DL_new are preceded
1383        --  by the literal 'head_literal_new'.
1384        --
1385        procedure Scoring (
1386          state           :     Machine_State;
1387          start           :     Positive;
1388          recursion_level :     Positive;
1389          prob            : out MProb;
1390          index           : out Positive;
1391          match_set       : out LZ77.Prefetch_Index_Type
1392        )
1393        is
1394          --  We compute the probability of the message sent for the bytes
1395          --  at position 'start' to the position 'last_pos_any_DL' and find the
1396          --  optimal combination using the matches in the DL array.
1397          prob_i, tail_prob : MProb;
1398          test_state : Machine_State;
1399          length_trunc, some_index : Positive;
1400          some_match_set : LZ77.Prefetch_Index_Type;
1401          last_pos_i : Integer;
1402        begin
1403          prob := 0.0;
1404          for m in matches'Range loop
1405            for i in 1 .. matches (m).count loop
1406              last_pos_i := matches (m).dl (i).length + offset_new_match_set (m /= old_match_index);
1407              if last_pos_i >= start then
1408                if last_pos_i < last_pos_any_DL and recursion_level >= 2 then
1409                  --  Skip case requiring insane recursion level: level = number of DL codes chained!
1410                  null;
1411                else
1412                  if m /= old_match_index and then start = 1 then
1413                    test_state := sim_new;  --  Shortcut to avoid resimulating the head literal.
1414                    prob_i := head_lit_prob;
1415                  else
1416                    test_state := state;  --  Clone the current state (general case including recursion).
1417                    prob_i := 1.0;
1418                  end if;
1419                  --
1420                  --  'length_trunc' is what remains of the DL code, DL (i), to be consumed.
1421                  --
1422                  if m = old_match_index then
1423                    --  Easy case: we execute one of the "old" matches.
1424                    length_trunc := matches (m).dl (i).length   - start + 1; --  always >= 1
1425                  elsif start = 1 then
1426                    --  We execute the full new DL code after the head literal.
1427                    length_trunc := matches (m).dl (i).length;
1428                  else  --  start >= 2. (2: full DL, 3: truncate by 1, etc.)
1429                    length_trunc := matches (m).dl (i).length   - start + 2;  --  always >= 1
1430                  end if;
1431                  pragma Assert (length_trunc >= 1);
1432                  --
1433                  if length_trunc = 1 then
1434                    --  Just simulate the literal we are sitting on: the buffer
1435                    --  has been already filled via Expand_DL_Code_to_Buffer.
1436                    --  It is a shortcut for and should be equivalent to the position checked below.
1437                    pragma Assert (
1438                      Text_Buf (state.R) =
1439                      Text_Buf ((state.R - UInt32 (matches (m).dl (i).distance)) and Text_Buf_Mask),
1440                      "Bytes of simulated copy do not match; start =" & Integer'Image (start) &
1441                      "; DL code distance="  & LZ77.Distance_Type'Image (matches (m).dl (i).distance) &
1442                      "; new match set="  & Boolean'Image (m /= old_match_index)
1443                    );
1444                    Simulate_Literal_Byte (
1445                      Text_Buf (state.R),
1446                      test_state,
1447                      prob_i);
1448                  else  --  Here, length_trunc >= 2
1449                    Simulate_any_DL_Code (
1450                      distance        => UInt32 (matches (m).dl (i).distance),
1451                      length          => length_trunc,
1452                      sim             => test_state,
1453                      prob            => prob_i,
1454                      recursion_limit => 1);
1455                  end if;
1456                  if last_pos_i < last_pos_any_DL then
1457                    Scoring (test_state, last_pos_i + 1, recursion_level + 1, tail_prob, some_index, some_match_set);
1458                    prob_i := prob_i * tail_prob;
1459                  end if;
1460                  if prob_i > prob then
1461                    prob      := prob_i;
1462                    index     := i;
1463                    match_set := m;
1464                  end if;
1465                end if;
1466              end if;
1467            end loop;
1468          end loop;
1469        end Scoring;
1470        --
1471        best_prob : MProb;
1472        new_wins : Boolean := False;
1473        last_pos_single_DL : Natural;
1474        match_for_max_last_pos : LZ77.Distance_Length_Pair;
1475        sim_expand : Machine_State := ES;
1476        sim_old : constant Machine_State := ES;
1477      begin
1478        for m in matches'Range loop
1479          for i in 1 .. matches (m).count loop
1480            last_pos_single_DL := matches (m).dl (i).length + offset_new_match_set (m /= old_match_index);
1481            if last_pos_single_DL > last_pos_any_DL then
1482              last_pos_any_DL := last_pos_single_DL;
1483              match_for_max_last_pos := matches (m).dl (i);
1484              new_wins := m /= old_match_index;
1485            end if;
1486          end loop;
1487        end loop;
1488        if new_wins then  --  Copy the literal to the buffer.
1489          Text_Buf (sim_expand.R) := prefixes (1);
1490          sim_expand.R := (sim_expand.R + 1) and Text_Buf_Mask;
1491        end if;
1492        Expand_DL_Code_to_Buffer (sim_expand, match_for_max_last_pos.distance, match_for_max_last_pos.length);
1493        --
1494        head_lit_prob := 1.0;
1495        Simulate_Literal_Byte (prefixes (1), sim_new, head_lit_prob);
1496        --
1497        Scoring (sim_old, 1, 1, best_prob, best_score_index, best_score_set);
1498      end Estimate_DL_Codes_for_LZ77;
1499  
1500      procedure My_LZ77 is
1501        new LZ77.Encode
1502           (String_buffer_size => String_buffer_size (level),
1503            Look_Ahead         => Max_length (level),
1504            Threshold          => Min_length (level) - 1,
1505            Method             => LZ77_choice (level),
1506            Read_Byte          => Read_Byte,
1507            More_Bytes         => More_Bytes,
1508            Write_Literal      => LZ77_emits_literal_byte,
1509            Write_DL_Code      => LZ77_emits_DL_code,
1510            Estimate_DL_Codes  => Estimate_DL_Codes_for_LZ77
1511          );
1512  
1513      procedure Write_LZMA_header is
1514        dw : UInt32 := params.dict_size;
1515        uw : Data_Bytes_Count := params.unpack_size;
1516      begin
1517        --  5-byte header
1518        Write_Byte (Byte (params.lc + 9 * params.lp + 9 * 5 * params.pb));
1519        for i in 0 .. 3 loop
1520          Write_Byte (Byte (dw mod 256));
1521          dw := dw / 256;
1522        end loop;
1523        --  8 bytes for unpacked size.
1524        --  This part of the header is optional => you need a
1525        --  prior knowledge or a "pre-header" indicating its presence or not.
1526        if params.header_has_size then
1527          for i in 0 .. 7 loop
1528            if params.unpack_size_defined then
1529              Write_Byte (Byte (uw mod 256));
1530              uw := uw / 256;
1531            else
1532              Write_Byte (16#FF#);
1533            end if;
1534          end loop;
1535        end if;
1536      end Write_LZMA_header;
1537  
1538    begin
1539      case level is
1540        when Level_0 | Level_1 =>
1541          compare_variants := None;
1542        when Level_2  =>
1543          compare_variants := Simple;
1544        when Level_3 =>
1545          compare_variants := Splitting;
1546      end case;
1547      Write_LZMA_header;
1548      My_LZ77;
1549      if params.has_end_mark then
1550        --  The end-of-stream marker is a fake "Simple Match" with a special distance.
1551        Encode_Bit (probs.switch.match (ES.state, ES.pos_state), DL_code_choice);
1552        Write_Simple_Match (
1553          dist_ip => end_of_stream_magic_distance,
1554          length  => Min_match_length
1555        );
1556      end if;
1557      Flush_range_encoder;
1558      Dispose (Text_Buf);
1559    exception
1560      when others =>
1561        Dispose (Text_Buf);
1562        raise;
1563    end Encode;
1564  
1565  end LZMA.Encoding;

Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip). Ada programming.
Some news about Zip-Ada and other Ada projects on Gautier's blog.