Source file : lzma-encoding.adb
1 -- LZMA.Encoding - a standalone, generic LZMA encoder.
2 -- Author: G. de Montmollin (except parts mentioned below (*)).
3 --
4 -- This encoder was built mostly by mirroring from LZMA.Decoding upon
5 -- the format's symmetries between encoding and decoding. For instance,
6 --
7 -- Bit_Tree_Decode(probs_len.low_coder(pos_state), Len_low_bits, len);
8 -- becomes:
9 -- Bit_Tree_Encode(probs_len.low_coder(pos_state), Len_low_bits, len);
10 --
11 -- Furthermore, cases for which there are alternatives are decided by comparing
12 -- their respective probabilities (search "MProb" in the code).
13 --
14 -- (*) The base mechanism (the encoding of range, literals and DL codes)
15 -- is from the original LzmaEnc.c by Igor Pavlov.
16 -- The Get_dist_slot function is from the LZMAEncoder.java by Lasse Collin.
17
18 -- Legal licensing note:
19
20 -- Copyright (c) 2016 .. 2020 Gautier de Montmollin
21 -- SWITZERLAND
22
23 -- Permission is hereby granted, free of charge, to any person obtaining a copy
24 -- of this software and associated documentation files (the "Software"), to deal
25 -- in the Software without restriction, including without limitation the rights
26 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
27 -- copies of the Software, and to permit persons to whom the Software is
28 -- furnished to do so, subject to the following conditions:
29
30 -- The above copyright notice and this permission notice shall be included in
31 -- all copies or substantial portions of the Software.
32
33 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
39 -- THE SOFTWARE.
40
41 -- NB: this is the MIT License, as found on the site
42 -- http://www.opensource.org/licenses/mit-license.php
43
44 --
45 -- Change log:
46 --------------
47 --
48 -- 18-Aug-2016: Fully functional.
49 -- 28-Jul-2016: Created.
50
51 with LZ77;
52
53 with Ada.Unchecked_Deallocation;
54
55 package body LZMA.Encoding is
56
57 use type Data_Bytes_Count;
58
59 procedure Encode
60 (level : Compression_Level := Level_1;
61 literal_context_bits : Literal_Context_Bits_Range := 3; -- Bits of last byte are used.
62 literal_position_bits : Literal_Position_Bits_Range := 0; -- Position mod 2**bits is used.
63 position_bits : Position_Bits_Range := 2; -- Position mod 2**bits is used.
64 end_marker : Boolean := True; -- Produce an End-Of-Stream marker ?
65 uncompressed_size_info : Boolean := False; -- Optional extra header needed for .lzma files.
66 -- In LZMA.Decoding, type LZMA_Hints: has_size.
67 dictionary_size : Natural := Default_dictionary_size) -- Not used by Level_1, Level_2.
68 is
69
70 -- Gets an integer [0, 63] matching the highest two bits of an integer.
71 -- It is a log2 function with one "decimal".
72 --
73 function Get_dist_slot (dist : UInt32) return Unsigned is
74 n : UInt32;
75 i : Natural;
76 begin
77 if dist <= Start_dist_model_index then
78 return Unsigned (dist);
79 end if;
80 n := dist;
81 i := 31;
82 if (n and 16#FFFF_0000#) = 0 then
83 n := Shift_Left (n, 16);
84 i := 15;
85 end if;
86 if (n and 16#FF00_0000#) = 0 then
87 n := Shift_Left (n, 8);
88 i := i - 8;
89 end if;
90 if (n and 16#F000_0000#) = 0 then
91 n := Shift_Left (n, 4);
92 i := i - 4;
93 end if;
94 if (n and 16#C000_0000#) = 0 then
95 n := Shift_Left (n, 2);
96 i := i - 2;
97 end if;
98 if (n and 16#8000_0000#) = 0 then
99 i := i - 1;
100 end if;
101 return Unsigned (i * 2) + Unsigned (Shift_Right (dist, i - 1) and 1);
102 end Get_dist_slot;
103
104 -- Round to the next power of two. BT4 borks without this for the window size.
105 function Ceiling_power_of_2 (x : Natural) return Positive is
106 p : Positive := 1;
107 begin
108 while p < Integer'Last / 2 and p < x loop
109 p := p * 2;
110 end loop;
111 return Integer'Max (p, x);
112 end Ceiling_power_of_2;
113
114 -----------------------------------
115 -- LZ77 compression parameters --
116 -----------------------------------
117
118 LZ77_choice : constant array (Compression_Level) of LZ77.Method_Type :=
119 (Level_0 => LZ77.No_LZ77, -- We don't do any LZ77 for level 0
120 Level_1 => LZ77.IZ_6,
121 Level_2 => LZ77.IZ_10,
122 Level_3 => LZ77.BT4);
123
124 Min_length : constant array (Compression_Level) of Positive :=
125 (Level_1 | Level_2 => 3, -- Deflate's minimum value
126 others => 2); -- LZMA's minimum value
127
128 Max_length : constant array (Compression_Level) of Positive :=
129 (Level_1 | Level_2 => 258, -- Deflate's maximum value
130 others => 273); -- LZMA's maximum value
131
132 extra_size : constant := 273 + 1 + LZ77.BT4_max_prefetch_positions;
133 -- Extra space is used for DL codes scoring before being
134 -- sent for real to the encoder.
135
136 -- String_buffer_size: the actual dictionary size used.
137 String_buffer_size : constant array (Compression_Level) of Positive :=
138 (Level_0 => 16, -- Fake: actually we don't use any LZ77 for level 0
139 Level_1 | Level_2 => 2 ** 15, -- Deflate's Value: 32 KiB
140 Level_3 =>
141 Integer'Max (
142 Min_dictionary_size, -- minimum: 4 KiB
143 Integer'Min (
144 -- dictionary_size is specified; default is 32 KiB
145 Ceiling_power_of_2 (dictionary_size + extra_size),
146 2 ** 28 -- maximum: 256 MiB
147 )
148 )
149 );
150
151 -----------------------------------------------------------
152 -- The LZMA "machine": here the LZ codes are processed --
153 -- and sent to the above bit encoder in a smart way. --
154 -----------------------------------------------------------
155
156 type LZMA_Params_Info is record
157 unpack_size : Data_Bytes_Count := 0;
158 -- unpack_size_defined is always False in this implementation:
159 -- size is not known in advance and the header cannot be
160 -- rewritten when processing is done.
161 unpack_size_defined : Boolean := False;
162 header_has_size : Boolean := uncompressed_size_info;
163 has_end_mark : Boolean := end_marker;
164 dict_size : UInt32 := UInt32 (String_buffer_size (level));
165 lc : Literal_Context_Bits_Range := literal_context_bits;
166 lp : Literal_Position_Bits_Range := literal_position_bits;
167 pb : Position_Bits_Range := position_bits;
168 end record;
169
170 params : LZMA_Params_Info;
171
172 -- Small stack of recent distances used for LZ. Required: initialized with zero values.
173 -- lzma-specification.txt: "That set of 4 variables contains zero-based match
174 -- distances and these variables are initialized with zero values"
175 --
176 subtype Repeat_stack_range is Integer range 0 .. 3;
177 type Repeat_Stack is array (Repeat_stack_range) of UInt32;
178 --
179 probs : All_probabilities (last_lit_prob_index => 16#300# * 2 ** (params.lc + params.lp) - 1);
180 pos_bits_mask : constant UInt32 := 2 ** params.pb - 1;
181 literal_pos_mask : constant UInt32 := 2 ** params.lp - 1;
182
183 -- We expand the DL codes in order to have some past data.
184 subtype Text_Buffer_Index is UInt32 range 0 .. UInt32 (String_buffer_size (level) - 1);
185 type Text_Buffer is array (Text_Buffer_Index) of Byte;
186 Text_Buf_Mask : constant UInt32 := UInt32 (String_buffer_size (level) - 1);
187 -- NB: heap allocation (and then, the only pointer in this package) is used
188 -- only for convenience because of small default stack sizes on some compilers.
189 type p_Text_Buffer is access Text_Buffer;
190 procedure Dispose is new Ada.Unchecked_Deallocation (Text_Buffer, p_Text_Buffer);
191 Text_Buf : p_Text_Buffer := new Text_Buffer;
192
193 function Idx_for_Literal_prob (position : Data_Bytes_Count; prev_byte : Byte) return Integer is
194 pragma Inline (Idx_for_Literal_prob);
195 begin
196 return 16#300# *
197 Integer (
198 Shift_Left (UInt32 (position) and literal_pos_mask, params.lc) +
199 Shift_Right (UInt32 (prev_byte), 8 - params.lc)
200 );
201 end Idx_for_Literal_prob;
202
203 type Variants_Comparison_Choice is
204 (
205 None, -- "Mechanical" encoding, straight from the LZ77 algorithm.
206 Simple, -- Compare simple alternative encodings and choose the most probable.
207 Splitting -- More advanced search for alternatives.
208 );
209
210 compare_variants : Variants_Comparison_Choice;
211
212 type Machine_State is record
213 state : State_range;
214 pos_state : Pos_state_range;
215 prev_byte : Byte;
216 R : UInt32;
217 total_pos : Data_Bytes_Count;
218 rep_dist : Repeat_Stack;
219 end record;
220
221 -------------------------
222 -- Package Estimates --
223 -------------------------
224 --
225 -- Purpose: estimate probabilities of different alternative
226 -- encodings, in order to choose the most probable encoding.
227 -- Note that the LZMA encoder is already very efficient by
228 -- taking the obvious choices. It is possible to ignore this
229 -- package and its uses (see occurrences of "compare_variants").
230 --
231 -- In the following probability computations, we assume independent
232 -- (multiplicative) probabilities, just like the range encoder does
233 -- when adapting the range width. With higher probabilities, the width
234 -- will decrease less and the compression will be better.
235 -- Since the probability model is constantly adapting, we have kind of self-fulfilling
236 -- predictions - e.g. if a Short Rep Match is chosen against a Literal, the context
237 -- probabilities of the former will be increased instead of the latter.
238
239 package Estimates is
240 type MProb is digits 15 range 0.0 .. 1.0;
241 --
242 -- Literals
243 --
244 procedure Simulate_Literal_Byte (b : Byte; sim : in out Machine_State; prob : in out MProb);
245 --
246 function Test_Simple_Literal (
247 b, b_match : Byte;
248 prob : CProb_array;
249 sim : Machine_State
250 )
251 return MProb;
252 --
253 function Test_Short_Rep_Match (sim : Machine_State) return MProb;
254 --
255 function Test_Literal_Byte (b : Byte; sim : Machine_State) return MProb;
256 --
257 -- Matches
258 --
259 function Test_Repeat_Match (
260 index_rm : Repeat_stack_range;
261 length : Unsigned;
262 sim : Machine_State
263 )
264 return MProb;
265 --
266 function Test_Simple_Match (
267 distance : UInt32;
268 length : Unsigned;
269 sim : Machine_State
270 )
271 return MProb;
272
273 -- End of the obvious cases. Now things get tougher...
274
275 -- Here we get the probability of a general DL code
276 -- as Write_any_DL_code would generate it, including variants.
277
278 procedure Simulate_any_DL_Code (
279 distance : UInt32;
280 length : Match_length_range;
281 sim : in out Machine_State;
282 prob : in out MProb;
283 recursion_limit : Natural
284 );
285
286 function Test_any_DL_Code (
287 distance : UInt32;
288 length : Match_length_range;
289 sim : Machine_State;
290 recursion_limit : Natural
291 )
292 return MProb;
293
294 -- Constants like 0.1234 appearing hereafter are empirical, tuned, magic numbers.
295 -- To do: tune them with Machine Learning.
296
297 -- Sometimes, a longer path like sending a simple match
298 -- instead of a repeat match has a lower modelled probability.
299 -- To avoid underusing repeat matches by letting their probabilities
300 -- being adapted lower over time, we penalize the simple match alternative.
301 Malus_simple_match_vs_rep : constant := 0.55;
302
303 package DL_Code_Erosion is
304 -- It is sometimes better to split a DL code as a very frequent literal,
305 -- then a very frequent DL code with length-1.
306 Lit_then_DL_threshold : constant := 0.875;
307 --
308 function Malus_lit_then_DL (distance : UInt32; length : Match_length_range) return MProb;
309 pragma Inline (Malus_lit_then_DL);
310 -- Case of DL code split into a shorter DL code, then a literal.
311 function DL_code_then_Literal (
312 distance : UInt32;
313 length : Match_length_range;
314 sim : Machine_State;
315 recursion_limit : Natural
316 )
317 return MProb;
318 end DL_Code_Erosion;
319
320 -- Here we define a generic DL code emission that is the same
321 -- for simulation and actual writes. This way, we don't need to
322 -- synchronize two pieces of Ada code doing the same operation,
323 -- one in simulation and the other in real.
324 --
325 generic
326 with procedure Simulated_or_actual_Literal_Byte (
327 b : Byte;
328 sim : in out Machine_State;
329 prob : in out MProb);
330 --
331 with procedure Simulated_or_actual_Strict_DL_Code (
332 distance : UInt32;
333 length : Match_length_range;
334 sim : in out Machine_State;
335 prob : in out MProb
336 );
337 --
338 I_am_a_simulation : Boolean;
339 procedure Generic_any_DL_Code (
340 distance : UInt32;
341 length : Match_length_range;
342 sim : in out Machine_State;
343 prob : in out MProb;
344 recursion_limit : Natural
345 );
346
347 end Estimates;
348
349 package body Estimates is
350 To_Prob_Factor : constant -- We compute the division (more expensive) at compile-time.
351 MProb := 1.0 / MProb'Base (probability_model_count);
352
353 function To_Math (cp : CProb) return MProb is
354 pragma Inline (To_Math);
355 begin
356 return MProb'Base (cp) * To_Prob_Factor;
357 end To_Math;
358
359 function Test_Bit_Encoding (prob_bit : CProb; symbol : Unsigned) return MProb is
360 pragma Inline (Test_Bit_Encoding);
361 b : constant MProb'Base := MProb'Base (symbol); -- b = 0.0 or 1.0
362 begin
363 return b + (1.0 - 2.0 * b) * To_Math (prob_bit);
364 -- Branch-less equivalent of:
365 -- if bit = 0 then
366 -- return prob_bit;
367 -- else
368 -- return 1.0 - prob_bit;
369 -- end if;
370 end Test_Bit_Encoding;
371
372 function Test_Simple_Literal (
373 b, b_match : Byte;
374 prob : CProb_array;
375 sim : Machine_State
376 ) return MProb
377 is
378 prob_lit : MProb := Test_Bit_Encoding (probs.switch.match (sim.state, sim.pos_state), Literal_choice);
379 symb : UInt32 := UInt32 (b) or 16#100#;
380 --
381 procedure Simulate_Literal is
382 begin
383 loop
384 prob_lit := prob_lit *
385 Test_Bit_Encoding (
386 prob_bit => prob (Integer (Shift_Right (symb, 8)) + prob'First),
387 symbol => Unsigned (Shift_Right (symb, 7)) and 1
388 );
389 symb := Shift_Left (symb, 1);
390 exit when symb >= 16#10000#;
391 end loop;
392 end Simulate_Literal;
393 --
394 procedure Simulate_Literal_Matched is
395 offs : UInt32 := 16#100#;
396 match : UInt32 := UInt32 (b_match);
397 begin
398 loop
399 match := Shift_Left (match, 1);
400 prob_lit := prob_lit *
401 Test_Bit_Encoding (
402 prob_bit => prob (Integer (offs + (match and offs) +
403 Shift_Right (symb, 8)) + prob'First),
404 symbol => Unsigned (Shift_Right (symb, 7)) and 1
405 );
406 symb := Shift_Left (symb, 1);
407 offs := offs and not (match xor symb);
408 exit when symb >= 16#10000#;
409 end loop;
410 end Simulate_Literal_Matched;
411 --
412 begin
413 if sim.state < 7 then
414 Simulate_Literal;
415 else
416 Simulate_Literal_Matched;
417 end if;
418 return prob_lit;
419 end Test_Simple_Literal;
420
421 function Test_Short_Rep_Match (sim : Machine_State) return MProb is
422 begin
423 return
424 Test_Bit_Encoding (probs.switch.match (sim.state, sim.pos_state), DL_code_choice) *
425 Test_Bit_Encoding (probs.switch.rep (sim.state), Rep_match_choice) *
426 Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_rep0_choice) *
427 Test_Bit_Encoding (probs.switch.rep0_long (sim.state, sim.pos_state), The_length_is_1_choice);
428 end Test_Short_Rep_Match;
429
430 -- We simulate here LZ77_emits_literal_byte.
431 procedure Simulate_Literal_Byte (b : Byte; sim : in out Machine_State; prob : in out MProb) is
432 probs_lit_idx : constant Integer := Idx_for_Literal_prob (sim.total_pos, sim.prev_byte);
433 ltr, srm : MProb;
434 procedure Update_pos_related_stuff is
435 begin
436 sim.R := (sim.R + 1) and Text_Buf_Mask;
437 sim.total_pos := sim.total_pos + 1;
438 sim.pos_state := Pos_state_range (UInt32 (sim.total_pos) and pos_bits_mask);
439 sim.prev_byte := b;
440 end Update_pos_related_stuff;
441 b_match : constant Byte := Text_Buf ((sim.R - sim.rep_dist (0) - 1) and Text_Buf_Mask);
442 begin
443 sim.pos_state := Pos_state_range (UInt32 (sim.total_pos) and pos_bits_mask);
444 ltr := Test_Simple_Literal (b, b_match, probs.lit (probs_lit_idx .. probs.lit'Last), sim);
445 if b = b_match and then sim.total_pos > Data_Bytes_Count (sim.rep_dist (0) + 1) then
446 srm := Test_Short_Rep_Match (sim);
447 if srm > ltr then
448 -- Short Rep would be preferred.
449 sim.state := Update_State_ShortRep (sim.state);
450 prob := prob * srm;
451 Update_pos_related_stuff;
452 return;
453 end if;
454 end if;
455 sim.state := Update_State_Literal (sim.state);
456 prob := prob * ltr;
457 Update_pos_related_stuff;
458 end Simulate_Literal_Byte;
459
460 function Test_Literal_Byte (b : Byte; sim : Machine_State) return MProb is
461 -- The following variable is discarded after the simulation,
462 -- since we only test the literal generation for getting its probability.
463 sim_var : Machine_State := sim;
464 prob : MProb := 1.0;
465 begin
466 Simulate_Literal_Byte (b, sim_var, prob);
467 return prob;
468 end Test_Literal_Byte;
469
470 function Simulate_Bit_Tree (prob : CProb_array; num_bits : Positive; symbol : Unsigned) return MProb is
471 res : MProb := 1.0;
472 bit, m : Unsigned;
473 begin
474 m := 1;
475 for i in reverse 0 .. num_bits - 1 loop
476 bit := Unsigned (Shift_Right (UInt32 (symbol), i)) and 1;
477 res := res * Test_Bit_Encoding (prob (Integer (m) + prob'First), bit);
478 m := 2 * m + bit;
479 end loop;
480 return res;
481 end Simulate_Bit_Tree;
482
483 function Test_Length (
484 probs_len : Probs_for_LZ_Lengths;
485 length : Unsigned;
486 sim_pos_state : Pos_state_range
487 )
488 return MProb
489 is
490 len : Unsigned := length - Min_match_length;
491 res : MProb;
492 begin
493 if len < Len_low_symbols then
494 res := Test_Bit_Encoding (probs_len.choice_1, 0) *
495 Simulate_Bit_Tree (probs_len.low_coder (sim_pos_state), Len_low_bits, len);
496 else
497 res := Test_Bit_Encoding (probs_len.choice_1, 1);
498 len := len - Len_low_symbols;
499 if len < Len_mid_symbols then
500 res := res * Test_Bit_Encoding (probs_len.choice_2, 0)
501 * Simulate_Bit_Tree (probs_len.mid_coder (sim_pos_state), Len_mid_bits, len);
502 else
503 res := res * Test_Bit_Encoding (probs_len.choice_2, 1);
504 len := len - Len_mid_symbols;
505 res := res * Simulate_Bit_Tree (probs_len.high_coder, Len_high_bits, len);
506 end if;
507 end if;
508 return res;
509 end Test_Length;
510
511 function Test_Repeat_Match (
512 index_rm : Repeat_stack_range;
513 length : Unsigned;
514 sim : Machine_State
515 )
516 return MProb
517 is
518 res : MProb := Test_Bit_Encoding (probs.switch.rep (sim.state), Rep_match_choice);
519 begin
520 case index_rm is
521 when 0 =>
522 res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_rep0_choice)
523 * Test_Bit_Encoding
524 (probs.switch.rep0_long (sim.state, sim.pos_state), The_length_is_not_1_choice);
525 when 1 =>
526 res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_not_rep0_choice)
527 * Test_Bit_Encoding (probs.switch.rep_g1 (sim.state), The_distance_is_rep1_choice);
528 when 2 =>
529 res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_not_rep0_choice)
530 * Test_Bit_Encoding (probs.switch.rep_g1 (sim.state), The_distance_is_not_rep1_choice)
531 * Test_Bit_Encoding (probs.switch.rep_g2 (sim.state), The_distance_is_rep2_choice);
532 when 3 =>
533 res := res * Test_Bit_Encoding (probs.switch.rep_g0 (sim.state), The_distance_is_not_rep0_choice)
534 * Test_Bit_Encoding (probs.switch.rep_g1 (sim.state), The_distance_is_not_rep1_choice)
535 * Test_Bit_Encoding (probs.switch.rep_g2 (sim.state), The_distance_is_not_rep2_choice);
536 end case;
537 return res * Test_Length (probs.rep_len, length, sim.pos_state);
538 end Test_Repeat_Match;
539
540 function Test_Simple_Match (
541 distance : UInt32;
542 length : Unsigned;
543 sim : Machine_State
544 )
545 return MProb
546 is
547 --
548 function Simulate_Bit_Tree_Reverse (prob : CProb_array; num_bits : Natural; symbol : UInt32)
549 return MProb
550 is
551 res : MProb := 1.0;
552 symb : UInt32 := symbol;
553 m : Unsigned := 1;
554 bit : Unsigned;
555 begin
556 for count_bits in reverse 1 .. num_bits loop
557 bit := Unsigned (symb) and 1;
558 res := res * Test_Bit_Encoding (prob (Integer (m) + prob'First), bit);
559 m := 2 * m + bit;
560 symb := Shift_Right (symb, 1);
561 end loop;
562 return res;
563 end Simulate_Bit_Tree_Reverse;
564 --
565 function Test_Distance return MProb is
566 len_state : constant Unsigned := Unsigned'Min (length - 2, len_to_pos_states - 1);
567 dist_slot : constant Unsigned := Get_dist_slot (distance);
568 base, dist_reduced : UInt32;
569 footerBits : Natural;
570 res : MProb;
571 begin
572 res := Simulate_Bit_Tree (probs.dist.slot_coder (len_state), Dist_slot_bits, dist_slot);
573 if dist_slot >= Start_dist_model_index then
574 footerBits := Natural (Shift_Right (UInt32 (dist_slot), 1)) - 1;
575 base := Shift_Left (UInt32 (2 or (dist_slot and 1)), footerBits);
576 dist_reduced := distance - base;
577 if dist_slot < End_dist_model_index then
578 res := res *
579 Simulate_Bit_Tree_Reverse (
580 probs.dist.pos_coder (Integer (base) - Integer (dist_slot) - 1 .. Pos_coder_range'Last),
581 footerBits,
582 dist_reduced
583 );
584 else
585 res := res *
586 (0.5 ** (footerBits - align_bits)) * -- direct bits
587 Simulate_Bit_Tree_Reverse (
588 probs.dist.align_coder,
589 align_bits,
590 dist_reduced and align_mask
591 );
592 end if;
593 end if;
594 return res;
595 end Test_Distance;
596 begin
597 return
598 Test_Bit_Encoding (probs.switch.rep (sim.state), Simple_match_choice) *
599 Test_Length (probs.len, length, sim.pos_state) *
600 Test_Distance;
601 end Test_Simple_Match;
602
603 -- We simulate here a Distance-Length code
604 -- sent straight to the encoder (no variants).
605 procedure Simulate_Strict_DL_Code (
606 distance : UInt32;
607 length : Match_length_range;
608 sim : in out Machine_State;
609 prob : in out MProb
610 )
611 is
612 pragma Inline (Simulate_Strict_DL_Code);
613 dist_ip : constant UInt32 := UInt32 (distance - 1); -- 7-Zip distance convention (minus 1)
614 found_repeat : Integer := Repeat_Stack'First - 1;
615 dlc : constant MProb := Test_Bit_Encoding (probs.switch.match (sim.state, sim.pos_state), DL_code_choice);
616 sma : constant MProb := Test_Simple_Match (dist_ip, Unsigned (length), sim);
617 rma : MProb;
618 aux : UInt32;
619 procedure Update_pos_related_stuff is
620 begin
621 sim.total_pos := sim.total_pos + Data_Bytes_Count (length);
622 sim.pos_state := Pos_state_range (UInt32 (sim.total_pos) and pos_bits_mask);
623 sim.R := (sim.R + UInt32 (length)) and Text_Buf_Mask; -- This is mod String_buffer_size
624 sim.prev_byte := Text_Buf ((sim.R - 1) and Text_Buf_Mask);
625 end Update_pos_related_stuff;
626 begin
627 for i in Repeat_Stack'Range loop
628 if dist_ip = sim.rep_dist (i) then
629 found_repeat := i;
630 exit;
631 -- NB: it's possible to pick the most probable duplicate instead, but without clear gain
632 end if;
633 end loop;
634 if found_repeat >= Repeat_Stack'First then
635 rma := Test_Repeat_Match (found_repeat, Unsigned (length), sim);
636 if rma >= sma * Malus_simple_match_vs_rep then
637 -- Repeat match case:
638 prob := prob * dlc * rma;
639 -- Roll the stack of recent distances up to the found item, which becomes the first one.
640 aux := sim.rep_dist (found_repeat);
641 for i in reverse 1 .. found_repeat loop
642 sim.rep_dist (i) := sim.rep_dist (i - 1);
643 end loop;
644 sim.rep_dist (0) := aux;
645 sim.state := Update_State_Rep (sim.state);
646 Update_pos_related_stuff;
647 return;
648 end if;
649 end if;
650 -- Simple match case:
651 prob := prob * dlc * sma;
652 -- Shift the stack of recent distances; the new distance becomes the first item.
653 for i in reverse 1 .. Repeat_stack_range'Last loop
654 sim.rep_dist (i) := sim.rep_dist (i - 1);
655 end loop;
656 sim.rep_dist (0) := dist_ip; -- 0-based distance.
657 sim.state := Update_State_Match (sim.state);
658 Update_pos_related_stuff;
659 end Simulate_Strict_DL_Code;
660
661 function Test_Strict_DL_Code (
662 distance : UInt32;
663 length : Match_length_range;
664 sim : Machine_State
665 )
666 return MProb
667 is
668 pragma Inline (Test_Strict_DL_Code);
669 -- The following variable is discarded after the simulation,
670 -- since we only test strict DL code for getting its probability.
671 sim_var : Machine_State := sim;
672 --
673 prob : MProb := 1.0;
674 begin
675 Simulate_Strict_DL_Code (distance, length, sim_var, prob);
676 return prob;
677 end Test_Strict_DL_Code;
678
679 -- Expand fully a DL code as a string of literals.
680 procedure Simulate_Expand_DL_code (
681 distance : UInt32;
682 length : Match_length_range;
683 give_up : MProb;
684 sim : in out Machine_State;
685 prob : in out MProb
686 )
687 is
688 pragma Inline (Simulate_Expand_DL_code);
689 b : Byte;
690 --
691 sim_mem : constant Machine_State := sim;
692 expanded_string_prob : MProb := 1.0;
693 Copy_start : constant UInt32 := (sim.R - distance) and Text_Buf_Mask;
694 begin
695 for x in 1 .. length loop
696 b := Text_Buf ((Copy_start + UInt32 (x - 1)) and Text_Buf_Mask);
697 Simulate_Literal_Byte (b, sim, expanded_string_prob);
698 -- Probability is decreasing over the loop, so it is
699 -- useless to continue under given threshold.
700 if expanded_string_prob < give_up then
701 sim := sim_mem;
702 exit;
703 end if;
704 sim.prev_byte := b;
705 end loop;
706 prob := prob * expanded_string_prob;
707 end Simulate_Expand_DL_code;
708
709 function Test_Expanded_DL_Code (
710 distance : UInt32;
711 length : Match_length_range;
712 give_up : MProb;
713 sim : Machine_State
714 )
715 return MProb
716 is
717 pragma Inline (Test_Expanded_DL_Code);
718 -- The following variable is discarded after the simulation,
719 -- since we only test the DL code expansion for getting its probability.
720 sim_var : Machine_State := sim;
721 --
722 prob : MProb := 1.0;
723 begin
724 Simulate_Expand_DL_code (distance, length, give_up, sim_var, prob);
725 return prob;
726 end Test_Expanded_DL_Code;
727
728 -- Case of a DL code split into two shorter DL codes.
729 procedure Test_Split_DL (
730 distance : UInt32;
731 length : Match_length_range;
732 sim : Machine_State;
733 hurdle : MProb;
734 recursion_limit : Natural;
735 best_prob : out MProb;
736 best_cut : out Match_length_range
737 );
738 pragma Inline (Test_Split_DL);
739
740 procedure Generic_any_DL_Code (
741 distance : UInt32;
742 length : Match_length_range;
743 sim : in out Machine_State;
744 prob : in out MProb;
745 recursion_limit : Natural
746 )
747 is
748 Copy_start : constant UInt32 := (sim.R - distance) and Text_Buf_Mask;
749 strict_dlc, expanded_dlc, strict_or_expanded_dlc, dlc_after_lit, head_lit : MProb;
750 b_head : Byte;
751 sim_post_lit_pos_state : Pos_state_range;
752 best_prob : MProb;
753 best_cut : Match_length_range;
754 new_recursion_limit : Integer;
755 begin
756 if I_am_a_simulation then
757 new_recursion_limit := recursion_limit - 1;
758 else
759 new_recursion_limit := recursion_limit; -- We do not limit in actual emission.
760 end if;
761 if new_recursion_limit < 0 then
762 Simulated_or_actual_Strict_DL_Code (distance, length, sim, prob);
763 return;
764 end if;
765 if compare_variants >= Simple then
766 strict_dlc := Test_Strict_DL_Code (distance, length, sim);
767 expanded_dlc := Test_Expanded_DL_Code (distance, length, strict_dlc, sim);
768 strict_or_expanded_dlc := MProb'Max (strict_dlc, expanded_dlc);
769 --
770 if length > Min_match_length then
771 b_head := Text_Buf (Copy_start and Text_Buf_Mask);
772 head_lit := Test_Literal_Byte (b_head, sim);
773 -- One literal, then a shorter DL code, case #1:
774 -- naive approach: we spot a super-probable literal.
775 if head_lit >= DL_Code_Erosion.Lit_then_DL_threshold then
776 Simulated_or_actual_Literal_Byte (b_head, sim, prob);
777 Generic_any_DL_Code (distance, length - 1, sim, prob, new_recursion_limit);
778 return;
779 end if;
780 -- One literal, then a shorter DL code, case #2:
781 -- we estimate the shorter DL code's probability.
782 sim_post_lit_pos_state := Pos_state_range (UInt32 (sim.total_pos + 1) and pos_bits_mask);
783 dlc_after_lit :=
784 Test_any_DL_Code (
785 distance, length - 1,
786 (Update_State_Literal (sim.state), sim_post_lit_pos_state, b_head,
787 (sim.R + 1) and Text_Buf_Mask, sim.total_pos + 1, sim.rep_dist),
788 new_recursion_limit
789 );
790 if head_lit * dlc_after_lit * DL_Code_Erosion.Malus_lit_then_DL (distance, length)
791 > strict_or_expanded_dlc
792 then
793 Simulated_or_actual_Literal_Byte (b_head, sim, prob);
794 Generic_any_DL_Code (distance, length - 1, sim, prob, new_recursion_limit);
795 return;
796 end if;
797 if DL_Code_Erosion.DL_code_then_Literal (distance, length, sim, new_recursion_limit)
798 > strict_or_expanded_dlc
799 then
800 -- We've got a better probability -> redo this variant
801 -- (shorter DL code, then literal) for good.
802 Generic_any_DL_Code (distance, length - 1, sim, prob, new_recursion_limit);
803 Simulated_or_actual_Literal_Byte (Text_Buf ((sim.R - distance) and Text_Buf_Mask), sim, prob);
804 return;
805 end if;
806 end if;
807 --
808 if expanded_dlc > strict_dlc then
809 -- Here we prefer a full expansion of DL code as literals.
810 for x in 1 .. length loop
811 Simulated_or_actual_Literal_Byte (
812 Text_Buf ((Copy_start + UInt32 (x - 1)) and Text_Buf_Mask), sim, prob
813 );
814 end loop;
815 return;
816 end if;
817 end if;
818 if compare_variants >= Splitting then
819 Test_Split_DL (
820 distance, length,
821 sim, strict_or_expanded_dlc, new_recursion_limit,
822 best_prob, best_cut
823 );
824 if best_prob > strict_or_expanded_dlc then
825 Generic_any_DL_Code (distance, best_cut, sim, prob, new_recursion_limit);
826 Generic_any_DL_Code (distance, length - best_cut, sim, prob, new_recursion_limit);
827 return;
828 end if;
829 end if;
830 -- At this point, we go for simulating or writing the plain DL code.
831 Simulated_or_actual_Strict_DL_Code (distance, length, sim, prob);
832 end Generic_any_DL_Code;
833
834 -- We simulate here Write_any_DL_code, including the variants!
835 procedure Simulate_any_DL_Code_Instance is new Generic_any_DL_Code
836 (Simulated_or_actual_Literal_Byte => Simulate_Literal_Byte,
837 Simulated_or_actual_Strict_DL_Code => Simulate_Strict_DL_Code,
838 I_am_a_simulation => True);
839
840 procedure Simulate_any_DL_Code (
841 distance : UInt32;
842 length : Match_length_range;
843 sim : in out Machine_State;
844 prob : in out MProb;
845 recursion_limit : Natural
846 )
847 renames Simulate_any_DL_Code_Instance;
848
849 function Test_any_DL_Code (
850 distance : UInt32;
851 length : Match_length_range;
852 sim : Machine_State;
853 recursion_limit : Natural
854 )
855 return MProb
856 is
857 -- The following variable is discarded after the simulation,
858 -- since we only test the DL code for getting its probability.
859 sim_var : Machine_State := sim;
860 --
861 prob : MProb := 1.0;
862 begin
863 Simulate_any_DL_Code (distance, length, sim_var, prob, recursion_limit);
864 return prob;
865 end Test_any_DL_Code;
866
867 package body DL_Code_Erosion is
868 --
869 function DL_code_then_Literal (
870 distance : UInt32;
871 length : Match_length_range;
872 sim : Machine_State;
873 recursion_limit : Natural
874 )
875 return MProb
876 is
877 -- The following variable is discarded after the simulation,
878 -- since we only test this variant for getting its probability.
879 sim_var : Machine_State := sim;
880 -- This "DL erosion" technique empirically works better for shorter distances and lengths.
881 Malus_DL_then_lit : constant MProb :=
882 MProb'Max (0.0, 0.135 - MProb'Base (distance) * 1.0e-8 - MProb'Base (length) * 1.0e-4);
883 --
884 prob : MProb := Malus_DL_then_lit;
885 begin
886 Simulate_any_DL_Code (distance, length - 1, sim_var, prob, recursion_limit);
887 Simulate_Literal_Byte (Text_Buf ((sim_var.R - distance) and Text_Buf_Mask), sim_var, prob);
888 return prob;
889 end DL_code_then_Literal;
890 --
891 function Malus_lit_then_DL (distance : UInt32; length : Match_length_range) return MProb is
892 begin
893 -- This "DL erosion" technique empirically works better for shorter distances and lengths.
894 return MProb'Max (0.0, 0.064 - MProb'Base (distance) * 1.0e-9 - MProb'Base (length) * 3.0e-5);
895 end Malus_lit_then_DL;
896 --
897 end DL_Code_Erosion;
898
899 subtype Splits_considered is Match_length_range range 4 .. 9;
900
901 procedure Test_Split_DL (
902 distance : UInt32;
903 length : Match_length_range;
904 sim : Machine_State;
905 hurdle : MProb;
906 recursion_limit : Natural;
907 best_prob : out MProb;
908 best_cut : out Match_length_range
909 )
910 is
911 sim_var : Machine_State := sim;
912 -- For long distances, the DL split technique degrades compression and makes
913 -- the compression time explode.
914 Malus : constant MProb :=
915 MProb'Max (0.0, 0.27 - MProb'Base (distance) * 2.0e-6);
916 prob : MProb;
917 lowered_recursion_limit : constant Natural := Integer'Max (0, recursion_limit - 1);
918 begin
919 best_prob := 0.0;
920 best_cut := Match_length_range'First;
921 if Malus < hurdle then
922 return;
923 end if;
924 for cut in 2 .. length - 2 loop -- If length < 4 this loop is skipped.
925 if cut in Splits_considered or else length - cut in Splits_considered then
926 -- If we test all lengths the compression becomes too slow
927 -- (huge number of combinations since recursion is involved).
928 prob := Malus;
929 sim_var := sim; -- Set or reset simulation state.
930 Simulate_any_DL_Code (distance, cut, sim_var, prob, lowered_recursion_limit);
931 if prob <= hurdle then
932 null;
933 -- Give up this iteration, since the probability is already below the required
934 -- level -> would be even lower after simulating the second DL code.
935 else
936 Simulate_any_DL_Code (distance, length - cut, sim_var, prob, lowered_recursion_limit);
937 if prob > best_prob then
938 best_prob := prob;
939 best_cut := cut;
940 end if;
941 end if;
942 end if;
943 end loop;
944 end Test_Split_DL;
945
946 end Estimates;
947
948 -------------------------------------
949 -- Range encoding of single bits. --
950 -------------------------------------
951
952 type Range_Encoder is record
953 width : UInt32 := 16#FFFF_FFFF#; -- (*)
954 low : UInt64 := 0; -- The current range is [low, low+width[
955 cache : Byte := 0;
956 cache_size : UInt64 := 1;
957 end record;
958 -- (*) "width" is called "range" in LZMA spec and "remaining width" in G.N.N. Martin's
959 -- article about range encoding.
960
961 range_enc : Range_Encoder;
962 encoded_uncompressed_bytes : UInt64 := 0;
963
964 procedure Shift_low is
965 -- Top 32 bits of the lower range bound.
966 lb_top32 : constant UInt64 := Shift_Right (range_enc.low, 32);
967 -- Bottom 32 bits of the lower range bound.
968 lb_bottom32 : constant UInt32 := UInt32 (range_enc.low and 16#FFFF_FFFF#);
969 temp, lb_bits_33_40 : Byte;
970 begin
971 if lb_bottom32 < 16#FF00_0000# or else lb_top32 /= 0 then
972 -- Flush range_enc.cache_size bytes, based on only
973 -- 2 byte values: range_enc.cache and lb_bits_33_40.
974 -- The mechanism is a bit obscure (seems to be a carry)...
975 temp := range_enc.cache;
976 lb_bits_33_40 := Byte (lb_top32 and 16#FF#);
977 loop
978 Write_Byte (temp + lb_bits_33_40); -- Finally a byte is output sometimes!
979 temp := 16#FF#;
980 range_enc.cache_size := range_enc.cache_size - 1;
981 exit when range_enc.cache_size = 0;
982 end loop;
983 range_enc.cache := Byte (Shift_Right (lb_bottom32, 24) and 16#FF#); -- bits 25 to 32
984 end if;
985 range_enc.cache_size := range_enc.cache_size + 1;
986 -- Bits 25 to 32 are erased and the trailing zeroes are added.
987 range_enc.low := UInt64 (Shift_Left (lb_bottom32, 8));
988 end Shift_low;
989
990 procedure Flush_range_encoder is
991 begin
992 for i in 1 .. 5 loop
993 Shift_low;
994 end loop;
995 end Flush_range_encoder;
996
997 -- Normalize corresponds to G.N.N. Martin's revised algorithm's adding
998 -- of trailing digits (zeroes). The leftmost digits of the range don't
999 -- change anymore and can be output.
1000 --
1001 procedure Normalize is
1002 pragma Inline (Normalize);
1003 begin
1004 if range_enc.width < width_threshold then
1005 range_enc.width := Shift_Left (range_enc.width, 8); -- Trailing zeroes are added to width.
1006 Shift_low;
1007 end if;
1008 end Normalize;
1009
1010 procedure Encode_Bit (prob : in out CProb; symbol : in Unsigned) is
1011 pragma Inline (Encode_Bit);
1012 cur_prob : constant CProb := prob; -- Local copy
1013 -- The current interval is [low, high=low+width[ .
1014 -- The bound is between 0 and width, closer to 0 if prob
1015 -- is small, closer to width if prob is large.
1016 bound : constant UInt32 := Shift_Right (range_enc.width, probability_model_bits) * UInt32 (cur_prob);
1017 begin
1018 if symbol = 0 then
1019 -- Left sub-interval, for symbol 0: [low, low+bound[ .
1020 -- Set new range. low is unchanged, high is new.
1021 range_enc.width := bound;
1022 Normalize;
1023 -- Increase probability.
1024 -- The truncation ensures that prob <= Probability_model_count - (2**m - 1). See note (*).
1025 prob := cur_prob + Shift_Right (probability_model_count - cur_prob, probability_change_bits);
1026 else
1027 -- Right sub-interval, for symbol 1: [low+bound, high=low+width[ .
1028 -- Set new range. low is new, high is unchanged.
1029 range_enc.low := range_enc.low + UInt64 (bound);
1030 range_enc.width := range_enc.width - bound;
1031 Normalize;
1032 -- Decrease probability: prob:= prob - {prob / 2**m}, approx. equal to prob * (1 - 2**m).
1033 -- The truncation represented by {} ensures that prob >= 2**m - 1. See note (*).
1034 prob := cur_prob - Shift_Right (cur_prob, probability_change_bits);
1035 end if;
1036 -- (*) It can be checked exhaustively that it is always the case.
1037 -- A too low prob could cause the width to be too small or even zero.
1038 -- Same for "too high". See LZMA sheet in za_work.xls.
1039 end Encode_Bit;
1040
1041 -----------------------------------------------------------------------------------
1042 -- This part processes the case where LZ77 sends a literal (a plain text byte) --
1043 -----------------------------------------------------------------------------------
1044
1045 procedure Write_Literal (prob : in out CProb_array; symbol : in UInt32) is
1046 pragma Inline (Write_Literal);
1047 symb : UInt32 := symbol or 16#100#;
1048 begin
1049 loop
1050 Encode_Bit ( -- Prob. offset is always 1, 2, 4, 8, .. , 128
1051 prob => prob (Integer (Shift_Right (symb, 8)) + prob'First),
1052 symbol => Unsigned (Shift_Right (symb, 7)) and 1
1053 );
1054 symb := Shift_Left (symb, 1);
1055 exit when symb >= 16#10000#;
1056 end loop;
1057 end Write_Literal;
1058
1059 procedure Write_Literal_Matched (prob : in out CProb_array; symbol, matched : in UInt32) is
1060 pragma Inline (Write_Literal_Matched);
1061 symb : UInt32 := symbol or 16#100#;
1062 offs : UInt32 := 16#100#;
1063 match : UInt32 := matched;
1064 begin
1065 loop
1066 match := Shift_Left (match, 1);
1067 Encode_Bit (
1068 prob => prob (Integer (offs + (match and offs) + Shift_Right (symb, 8)) + prob'First),
1069 symbol => Unsigned (Shift_Right (symb, 7)) and 1
1070 );
1071 symb := Shift_Left (symb, 1);
1072 offs := offs and not (match xor symb);
1073 exit when symb >= 16#10000#;
1074 end loop;
1075 end Write_Literal_Matched;
1076
1077 use type Estimates.MProb;
1078
1079 -- Encoder State: state of the real LZMA encoder - data is written here, no simulation!
1080 ES : Machine_State :=
1081 (R => 0,
1082 prev_byte => 0,
1083 total_pos => 0,
1084 rep_dist => (others => 0),
1085 state => 0,
1086 pos_state => 0
1087 );
1088
1089 max_recursion : constant := 2;
1090
1091 procedure Update_pos_state is
1092 pragma Inline (Update_pos_state);
1093 begin
1094 ES.pos_state := Pos_state_range (UInt32 (ES.total_pos) and pos_bits_mask);
1095 end Update_pos_state;
1096
1097 procedure LZ77_emits_literal_byte (b : Byte) is
1098 pb_lit_idx : constant Integer := Idx_for_Literal_prob (ES.total_pos, ES.prev_byte);
1099 b_match : constant Byte := Text_Buf ((ES.R - ES.rep_dist (0) - 1) and Text_Buf_Mask);
1100 begin
1101 if b = b_match and then ES.total_pos > Data_Bytes_Count (ES.rep_dist (0) + 1)
1102 and then
1103 (compare_variants = None
1104 or else
1105 Estimates.Test_Short_Rep_Match (ES) >
1106 Estimates.Test_Simple_Literal (b, b_match, probs.lit (pb_lit_idx .. probs.lit'Last), ES))
1107 then
1108 -- We are lucky: both bytes are the same. No literal to encode, "Short Rep Match"
1109 -- case, and its cost (4 bits) is more affordable than the literal's cost.
1110 Encode_Bit (probs.switch.match (ES.state, ES.pos_state), DL_code_choice);
1111 Encode_Bit (probs.switch.rep (ES.state), Rep_match_choice);
1112 Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_rep0_choice);
1113 Encode_Bit (probs.switch.rep0_long (ES.state, ES.pos_state), The_length_is_1_choice);
1114 ES.state := Update_State_ShortRep (ES.state);
1115 else
1116 Encode_Bit (probs.switch.match (ES.state, ES.pos_state), Literal_choice);
1117 if ES.state < 7 then
1118 Write_Literal (probs.lit (pb_lit_idx .. probs.lit'Last), UInt32 (b));
1119 else
1120 Write_Literal_Matched (probs.lit (pb_lit_idx .. probs.lit'Last), UInt32 (b), UInt32 (b_match));
1121 end if;
1122 ES.state := Update_State_Literal (ES.state);
1123 end if;
1124 ES.total_pos := ES.total_pos + 1;
1125 Update_pos_state;
1126 ES.prev_byte := b;
1127 Text_Buf (ES.R) := b;
1128 ES.R := (ES.R + 1) and Text_Buf_Mask; -- This is mod String_buffer_size
1129 encoded_uncompressed_bytes := encoded_uncompressed_bytes + 1;
1130 end LZ77_emits_literal_byte;
1131
1132 procedure Write_Literal_Byte (
1133 b : Byte;
1134 dummy_sim : in out Machine_State;
1135 dummy_prob : in out Estimates.MProb)
1136 is
1137 begin
1138 LZ77_emits_literal_byte (b);
1139 end Write_Literal_Byte;
1140
1141 ---------------------------------------------------------------------------------
1142 -- This part processes the case where LZ77 sends a Distance-Length (DL) code --
1143 ---------------------------------------------------------------------------------
1144
1145 procedure Bit_Tree_Encode (
1146 prob : in out CProb_array;
1147 num_bits : Positive;
1148 symbol : Unsigned)
1149 is
1150 bit, m : Unsigned;
1151 begin
1152 m := 1;
1153 for i in reverse 0 .. num_bits - 1 loop
1154 bit := Unsigned (Shift_Right (UInt32 (symbol), i)) and 1;
1155 Encode_Bit (prob (Integer (m) + prob'First), bit);
1156 m := 2 * m + bit;
1157 end loop;
1158 end Bit_Tree_Encode;
1159
1160 procedure Encode_Length (probs_len : in out Probs_for_LZ_Lengths; length : Unsigned) is
1161 len : Unsigned := length - Min_match_length;
1162 begin
1163 if len < Len_low_symbols then
1164 Encode_Bit (probs_len.choice_1, 0);
1165 -- LZ length in [2..9], i.e. len in [0..7]
1166 Bit_Tree_Encode (probs_len.low_coder (ES.pos_state), Len_low_bits, len);
1167 else
1168 Encode_Bit (probs_len.choice_1, 1);
1169 len := len - Len_low_symbols;
1170 if len < Len_mid_symbols then
1171 Encode_Bit (probs_len.choice_2, 0);
1172 -- LZ length in [10..17], i.e. len in [0..7]
1173 Bit_Tree_Encode (probs_len.mid_coder (ES.pos_state), Len_mid_bits, len);
1174 else
1175 Encode_Bit (probs_len.choice_2, 1);
1176 len := len - Len_mid_symbols;
1177 -- LZ length in [18..273], i.e. len in [0..255]
1178 Bit_Tree_Encode (probs_len.high_coder, Len_high_bits, len);
1179 end if;
1180 end if;
1181 end Encode_Length;
1182
1183 procedure Write_Simple_Match (dist_ip : UInt32; length : Unsigned) is
1184 --
1185 procedure Bit_Tree_Reverse_Encode (
1186 prob : in out CProb_array;
1187 num_bits : in Natural;
1188 symbol : in UInt32
1189 )
1190 is
1191 symb : UInt32 := symbol;
1192 m : Unsigned := 1;
1193 bit : Unsigned;
1194 begin
1195 for count_bits in reverse 1 .. num_bits loop
1196 bit := Unsigned (symb) and 1;
1197 Encode_Bit (prob (Integer (m) + prob'First), bit);
1198 m := 2 * m + bit;
1199 symb := Shift_Right (symb, 1);
1200 end loop;
1201 end Bit_Tree_Reverse_Encode;
1202
1203 -- Range encoding of num_bits with equiprobability.
1204 --
1205 procedure Encode_Direct_Bits (value : UInt32; num_bits : Natural) is
1206 begin
1207 for i in reverse 0 .. num_bits - 1 loop
1208 -- Bound is the half width. New width is halved anyway.
1209 range_enc.width := Shift_Right (range_enc.width, 1);
1210 -- Either low is unchanged (bit=0), or new low := old low + bound (bit=1).
1211 range_enc.low := range_enc.low +
1212 (UInt64 (range_enc.width) and (0 - UInt64 (Shift_Right (value, i) and 1)));
1213 Normalize;
1214 end loop;
1215 end Encode_Direct_Bits;
1216 --
1217 procedure Encode_Distance is
1218 len_state : constant Unsigned := Unsigned'Min (length - 2, len_to_pos_states - 1);
1219 dist_slot : constant Unsigned := Get_dist_slot (dist_ip);
1220 base, dist_reduced : UInt32;
1221 footerBits : Natural;
1222 begin
1223 Bit_Tree_Encode (probs.dist.slot_coder (len_state), Dist_slot_bits, dist_slot);
1224 if dist_slot >= Start_dist_model_index then
1225 footerBits := Natural (Shift_Right (UInt32 (dist_slot), 1)) - 1;
1226 base := Shift_Left (UInt32 (2 or (dist_slot and 1)), footerBits);
1227 dist_reduced := dist_ip - base;
1228 if dist_slot < End_dist_model_index then
1229 Bit_Tree_Reverse_Encode (
1230 probs.dist.pos_coder (Integer (base) - Integer (dist_slot) - 1 .. Pos_coder_range'Last),
1231 footerBits,
1232 dist_reduced
1233 );
1234 else
1235 Encode_Direct_Bits (Shift_Right (dist_reduced, align_bits), footerBits - align_bits);
1236 Bit_Tree_Reverse_Encode (
1237 probs.dist.align_coder,
1238 align_bits,
1239 dist_reduced and align_mask
1240 );
1241 end if;
1242 end if;
1243 end Encode_Distance;
1244 --
1245 begin
1246 Encode_Bit (probs.switch.rep (ES.state), Simple_match_choice);
1247 ES.state := Update_State_Match (ES.state);
1248 Encode_Length (probs.len, length);
1249 Encode_Distance;
1250 -- Shift the stack of recent distances; the new distance becomes the first item.
1251 for i in reverse 1 .. Repeat_stack_range'Last loop
1252 ES.rep_dist (i) := ES.rep_dist (i - 1);
1253 end loop;
1254 ES.rep_dist (0) := dist_ip;
1255 end Write_Simple_Match;
1256
1257 procedure Write_Repeat_Match (index_rm : Repeat_stack_range; length : Unsigned) is
1258 aux : UInt32;
1259 begin
1260 Encode_Bit (probs.switch.rep (ES.state), Rep_match_choice);
1261 case index_rm is
1262 when 0 =>
1263 Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_rep0_choice);
1264 Encode_Bit (probs.switch.rep0_long (ES.state, ES.pos_state), The_length_is_not_1_choice);
1265 when 1 =>
1266 Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_not_rep0_choice);
1267 Encode_Bit (probs.switch.rep_g1 (ES.state), The_distance_is_rep1_choice);
1268 when 2 =>
1269 Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_not_rep0_choice);
1270 Encode_Bit (probs.switch.rep_g1 (ES.state), The_distance_is_not_rep1_choice);
1271 Encode_Bit (probs.switch.rep_g2 (ES.state), The_distance_is_rep2_choice);
1272 when 3 =>
1273 Encode_Bit (probs.switch.rep_g0 (ES.state), The_distance_is_not_rep0_choice);
1274 Encode_Bit (probs.switch.rep_g1 (ES.state), The_distance_is_not_rep1_choice);
1275 Encode_Bit (probs.switch.rep_g2 (ES.state), The_distance_is_not_rep2_choice);
1276 end case;
1277 -- Roll the stack of recent distances up to the found item, which becomes the first one.
1278 aux := ES.rep_dist (index_rm);
1279 for i in reverse 1 .. index_rm loop
1280 ES.rep_dist (i) := ES.rep_dist (i - 1);
1281 end loop;
1282 ES.rep_dist (0) := aux;
1283 --
1284 Encode_Length (probs.rep_len, length);
1285 ES.state := Update_State_Rep (ES.state);
1286 end Write_Repeat_Match;
1287
1288 procedure Write_Strict_DL_Code (
1289 distance : UInt32;
1290 length : Match_length_range;
1291 dummy_sim : in out Machine_State;
1292 dummy_prob : in out Estimates.MProb
1293 )
1294 is
1295 dist_ip : constant UInt32 := UInt32 (distance - 1); -- 7-Zip distance convention (minus 1)
1296 found_repeat : Integer := Repeat_Stack'First - 1;
1297 begin
1298 pragma Assert (
1299 UInt64 (distance) <= encoded_uncompressed_bytes,
1300 "distance goes before input stream's begin"
1301 );
1302 Encode_Bit (probs.switch.match (ES.state, ES.pos_state), DL_code_choice);
1303 for i in Repeat_Stack'Range loop
1304 if dist_ip = ES.rep_dist (i) then
1305 found_repeat := i;
1306 exit;
1307 -- NB: it's possible to pick the most probable duplicate instead, but without clear gain
1308 end if;
1309 end loop;
1310 if found_repeat >= Repeat_Stack'First
1311 and then
1312 (compare_variants = None
1313 or else
1314 Estimates.Test_Repeat_Match (found_repeat, Unsigned (length), ES)
1315 >=
1316 Estimates.Test_Simple_Match (dist_ip, Unsigned (length), ES) *
1317 Estimates.Malus_simple_match_vs_rep
1318 )
1319 then
1320 Write_Repeat_Match (found_repeat, Unsigned (length));
1321 else
1322 Write_Simple_Match (dist_ip, Unsigned (length));
1323 end if;
1324 ES.total_pos := ES.total_pos + Data_Bytes_Count (length);
1325 Update_pos_state;
1326 ES.R := ES.R + UInt32 (length) and Text_Buf_Mask; -- This is mod String_buffer_size
1327 ES.prev_byte := Text_Buf ((ES.R - 1) and Text_Buf_Mask);
1328 end Write_Strict_DL_Code;
1329
1330 -- All the smart things to optimize the probability model by breaking
1331 -- DL codes is done in the following procedure:
1332 --
1333 procedure Write_any_DL_code is new Estimates.Generic_any_DL_Code
1334 (Simulated_or_actual_Literal_Byte => Write_Literal_Byte,
1335 Simulated_or_actual_Strict_DL_Code => Write_Strict_DL_Code,
1336 I_am_a_simulation => False);
1337
1338 procedure Expand_DL_Code_to_Buffer (
1339 sim : Machine_State;
1340 distance : Integer;
1341 length : Match_length_range
1342 )
1343 is
1344 Rx : UInt32 := sim.R;
1345 Copy_start : constant UInt32 := (sim.R - UInt32 (distance)) and Text_Buf_Mask;
1346 begin
1347 -- Expand early into the circular "text" buffer to have it up to date
1348 -- and available to simulations.
1349 for K in 0 .. UInt32 (length - 1) loop
1350 Text_Buf (Rx) := Text_Buf ((Copy_start + K) and Text_Buf_Mask);
1351 Rx := (Rx + 1) and Text_Buf_Mask; -- This is mod String_buffer_size
1352 end loop;
1353 end Expand_DL_Code_to_Buffer;
1354
1355 procedure LZ77_emits_DL_code (distance : Integer; length : Match_length_range) is
1356 dummy_prob : Estimates.MProb := 0.0;
1357 begin
1358 Expand_DL_Code_to_Buffer (ES, distance, length);
1359 encoded_uncompressed_bytes := encoded_uncompressed_bytes + UInt64 (length);
1360 Write_any_DL_code (UInt32 (distance), length, ES, dummy_prob, max_recursion);
1361 end LZ77_emits_DL_code;
1362
1363 procedure Estimate_DL_Codes_for_LZ77 (
1364 matches : in out LZ77.Matches_Array;
1365 old_match_index : in Natural;
1366 prefixes : in LZ77.Byte_Array;
1367 best_score_index : out Positive;
1368 best_score_set : out LZ77.Prefetch_Index_Type;
1369 match_trace : out LZ77.DLP_Array
1370 )
1371 is
1372 pragma Unreferenced (match_trace);
1373 use Estimates;
1374 last_pos_any_DL : Natural := 0;
1375 sim_new : Machine_State := ES;
1376 offset_new_match_set : constant array (Boolean) of Natural := (0, 1);
1377 head_lit_prob : MProb;
1378 --
1379 -- Compare different ways of sending DL codes starting with
1380 -- DL (i), with a total length 'last_pos_any_DL', in order to
1381 -- compare sequences of the same length starting with the different
1382 -- matches in DL_old and DL_new. The matches in DL_new are preceded
1383 -- by the literal 'head_literal_new'.
1384 --
1385 procedure Scoring (
1386 state : Machine_State;
1387 start : Positive;
1388 recursion_level : Positive;
1389 prob : out MProb;
1390 index : out Positive;
1391 match_set : out LZ77.Prefetch_Index_Type
1392 )
1393 is
1394 -- We compute the probability of the message sent for the bytes
1395 -- at position 'start' to the position 'last_pos_any_DL' and find the
1396 -- optimal combination using the matches in the DL array.
1397 prob_i, tail_prob : MProb;
1398 test_state : Machine_State;
1399 length_trunc, some_index : Positive;
1400 some_match_set : LZ77.Prefetch_Index_Type;
1401 last_pos_i : Integer;
1402 begin
1403 prob := 0.0;
1404 for m in matches'Range loop
1405 for i in 1 .. matches (m).count loop
1406 last_pos_i := matches (m).dl (i).length + offset_new_match_set (m /= old_match_index);
1407 if last_pos_i >= start then
1408 if last_pos_i < last_pos_any_DL and recursion_level >= 2 then
1409 -- Skip case requiring insane recursion level: level = number of DL codes chained!
1410 null;
1411 else
1412 if m /= old_match_index and then start = 1 then
1413 test_state := sim_new; -- Shortcut to avoid resimulating the head literal.
1414 prob_i := head_lit_prob;
1415 else
1416 test_state := state; -- Clone the current state (general case including recursion).
1417 prob_i := 1.0;
1418 end if;
1419 --
1420 -- 'length_trunc' is what remains of the DL code, DL (i), to be consumed.
1421 --
1422 if m = old_match_index then
1423 -- Easy case: we execute one of the "old" matches.
1424 length_trunc := matches (m).dl (i).length - start + 1; -- always >= 1
1425 elsif start = 1 then
1426 -- We execute the full new DL code after the head literal.
1427 length_trunc := matches (m).dl (i).length;
1428 else -- start >= 2. (2: full DL, 3: truncate by 1, etc.)
1429 length_trunc := matches (m).dl (i).length - start + 2; -- always >= 1
1430 end if;
1431 pragma Assert (length_trunc >= 1);
1432 --
1433 if length_trunc = 1 then
1434 -- Just simulate the literal we are sitting on: the buffer
1435 -- has been already filled via Expand_DL_Code_to_Buffer.
1436 -- It is a shortcut for and should be equivalent to the position checked below.
1437 pragma Assert (
1438 Text_Buf (state.R) =
1439 Text_Buf ((state.R - UInt32 (matches (m).dl (i).distance)) and Text_Buf_Mask),
1440 "Bytes of simulated copy do not match; start =" & Integer'Image (start) &
1441 "; DL code distance=" & LZ77.Distance_Type'Image (matches (m).dl (i).distance) &
1442 "; new match set=" & Boolean'Image (m /= old_match_index)
1443 );
1444 Simulate_Literal_Byte (
1445 Text_Buf (state.R),
1446 test_state,
1447 prob_i);
1448 else -- Here, length_trunc >= 2
1449 Simulate_any_DL_Code (
1450 distance => UInt32 (matches (m).dl (i).distance),
1451 length => length_trunc,
1452 sim => test_state,
1453 prob => prob_i,
1454 recursion_limit => 1);
1455 end if;
1456 if last_pos_i < last_pos_any_DL then
1457 Scoring (test_state, last_pos_i + 1, recursion_level + 1, tail_prob, some_index, some_match_set);
1458 prob_i := prob_i * tail_prob;
1459 end if;
1460 if prob_i > prob then
1461 prob := prob_i;
1462 index := i;
1463 match_set := m;
1464 end if;
1465 end if;
1466 end if;
1467 end loop;
1468 end loop;
1469 end Scoring;
1470 --
1471 best_prob : MProb;
1472 new_wins : Boolean := False;
1473 last_pos_single_DL : Natural;
1474 match_for_max_last_pos : LZ77.Distance_Length_Pair;
1475 sim_expand : Machine_State := ES;
1476 sim_old : constant Machine_State := ES;
1477 begin
1478 for m in matches'Range loop
1479 for i in 1 .. matches (m).count loop
1480 last_pos_single_DL := matches (m).dl (i).length + offset_new_match_set (m /= old_match_index);
1481 if last_pos_single_DL > last_pos_any_DL then
1482 last_pos_any_DL := last_pos_single_DL;
1483 match_for_max_last_pos := matches (m).dl (i);
1484 new_wins := m /= old_match_index;
1485 end if;
1486 end loop;
1487 end loop;
1488 if new_wins then -- Copy the literal to the buffer.
1489 Text_Buf (sim_expand.R) := prefixes (1);
1490 sim_expand.R := (sim_expand.R + 1) and Text_Buf_Mask;
1491 end if;
1492 Expand_DL_Code_to_Buffer (sim_expand, match_for_max_last_pos.distance, match_for_max_last_pos.length);
1493 --
1494 head_lit_prob := 1.0;
1495 Simulate_Literal_Byte (prefixes (1), sim_new, head_lit_prob);
1496 --
1497 Scoring (sim_old, 1, 1, best_prob, best_score_index, best_score_set);
1498 end Estimate_DL_Codes_for_LZ77;
1499
1500 procedure My_LZ77 is
1501 new LZ77.Encode
1502 (String_buffer_size => String_buffer_size (level),
1503 Look_Ahead => Max_length (level),
1504 Threshold => Min_length (level) - 1,
1505 Method => LZ77_choice (level),
1506 Read_Byte => Read_Byte,
1507 More_Bytes => More_Bytes,
1508 Write_Literal => LZ77_emits_literal_byte,
1509 Write_DL_Code => LZ77_emits_DL_code,
1510 Estimate_DL_Codes => Estimate_DL_Codes_for_LZ77
1511 );
1512
1513 procedure Write_LZMA_header is
1514 dw : UInt32 := params.dict_size;
1515 uw : Data_Bytes_Count := params.unpack_size;
1516 begin
1517 -- 5-byte header
1518 Write_Byte (Byte (params.lc + 9 * params.lp + 9 * 5 * params.pb));
1519 for i in 0 .. 3 loop
1520 Write_Byte (Byte (dw mod 256));
1521 dw := dw / 256;
1522 end loop;
1523 -- 8 bytes for unpacked size.
1524 -- This part of the header is optional => you need a
1525 -- prior knowledge or a "pre-header" indicating its presence or not.
1526 if params.header_has_size then
1527 for i in 0 .. 7 loop
1528 if params.unpack_size_defined then
1529 Write_Byte (Byte (uw mod 256));
1530 uw := uw / 256;
1531 else
1532 Write_Byte (16#FF#);
1533 end if;
1534 end loop;
1535 end if;
1536 end Write_LZMA_header;
1537
1538 begin
1539 case level is
1540 when Level_0 | Level_1 =>
1541 compare_variants := None;
1542 when Level_2 =>
1543 compare_variants := Simple;
1544 when Level_3 =>
1545 compare_variants := Splitting;
1546 end case;
1547 Write_LZMA_header;
1548 My_LZ77;
1549 if params.has_end_mark then
1550 -- The end-of-stream marker is a fake "Simple Match" with a special distance.
1551 Encode_Bit (probs.switch.match (ES.state, ES.pos_state), DL_code_choice);
1552 Write_Simple_Match (
1553 dist_ip => end_of_stream_magic_distance,
1554 length => Min_match_length
1555 );
1556 end if;
1557 Flush_range_encoder;
1558 Dispose (Text_Buf);
1559 exception
1560 when others =>
1561 Dispose (Text_Buf);
1562 raise;
1563 end Encode;
1564
1565 end LZMA.Encoding;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.