Source file : lz77.adb
1 -- There are four LZ77 encoders at choice in this package:
2 --
3 -- 1/ LZ77_using_LZHuf, based on LZHuf by Haruhiko Okumura and Haruyasu Yoshizaki.
4 --
5 -- 2/ LZ77_using_IZ, based on Info-Zip's Zip's deflate.c by Jean-Loup Gailly.
6 -- deflate.c is actually the LZ77 part of Info-Zip's compression.
7 --
8 -- 3/ LZ77_using_BT4, based on LZMA SDK's BT4 algorithm by Igor Pavlov.
9 --
10 -- 4/ LZ77_by_Rich, based on PROG2.C by Rich Geldreich, Jr.
11 --
12 -- Variant 1/, LZ77_using_LZHuf, is working since 2009. Two problems: it is slow
13 -- and not well adapted to the Deflate format (mediocre compression).
14 --
15 -- Variant 2/, LZ77_using_IZ, is much faster, and better suited for Deflate.
16 -- It has been added on 05-Mar-2016.
17 -- The code is tailored and optimized for a single set of
18 -- the String_buffer_size, Look_Ahead, Threshold LZ77 parameters - those for Deflate.
19 --
20 -- Variant 3/, LZ77_using_BT4, was added on 06-Sep-2016.
21 -- The seems to be the best match finder for LZMA on data of the >= 1 MiB scale.
22
23 -- To do:
24 --
25 -- 2/
26 -- - LZ77 / IZ: similar to the test with TOO_FAR, try to cluster distances around
27 -- values needing less extra bits (may not work at all...)
28 -- - LZ77 / IZ: tune TOO_FAR (max: 32767), see http://optipng.sf.net/pngtech/too_far.html
29 -- "TOO_FAR in zlib Is Not Too Far" for discussion
30
31 -- Legal licensing note:
32
33 -- Copyright (c) 2016 .. 2020 Gautier de Montmollin (maintainer of the Ada version)
34 -- SWITZERLAND
35
36 -- Permission is hereby granted, free of charge, to any person obtaining a copy
37 -- of this software and associated documentation files (the "Software"), to deal
38 -- in the Software without restriction, including without limitation the rights
39 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
40 -- copies of the Software, and to permit persons to whom the Software is
41 -- furnished to do so, subject to the following conditions:
42
43 -- The above copyright notice and this permission notice shall be included in
44 -- all copies or substantial portions of the Software.
45
46 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
48 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
51 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
52 -- THE SOFTWARE.
53
54 -- NB: this is the MIT License, as found 21-Aug-2016 on the site
55 -- http://www.opensource.org/licenses/mit-license.php
56
57 with Ada.Text_IO, Ada.Integer_Text_IO;
58 with Ada.Unchecked_Deallocation;
59 with Interfaces; use Interfaces;
60 with System;
61
62 package body LZ77 is
63
64 -- System.Word_Size: 13.3(8): A word is the largest amount of storage
65 -- that can be conveniently and efficiently manipulated by the hardware,
66 -- given the implementation's run-time model.
67 --
68 min_bits_32 : constant := Integer'Max (32, System.Word_Size);
69 min_bits_16 : constant := Integer'Max (16, System.Word_Size);
70
71 -- We define an Integer type which is at least 32 bits, but n bits
72 -- on a native n (> 32) bits architecture (no performance hit on 64+
73 -- bits architectures).
74 -- Integer_M16 not needed: Integer already guarantees 16 bits
75 --
76 type Integer_M32 is range -2**(min_bits_32 - 1) .. 2**(min_bits_32 - 1) - 1;
77 subtype Natural_M32 is Integer_M32 range 0 .. Integer_M32'Last;
78 -- subtype Positive_M32 is Integer_M32 range 1 .. Integer_M32'Last;
79
80 type Unsigned_M16 is mod 2**min_bits_16;
81 type Unsigned_M32 is mod 2**min_bits_32;
82
83 function Are_Matches_Sorted (m : Matches_Type) return Boolean is
84 begin
85 for i in 2 .. m.count loop
86 if m.dl (i).length < m.dl (i - 1).length then
87 return False;
88 end if;
89 end loop;
90 return True;
91 end Are_Matches_Sorted;
92
93 procedure Encode is
94
95 -----------------------
96 -- LZHuf algorithm --
97 -----------------------
98
99 procedure LZ77_using_LZHuf is
100 -- Based on LZHUF by OKUMURA & YOSHIZAKI.
101 -- Here the adaptive Huffman coding is thrown away:
102 -- algorithm is used only to find matching patterns.
103
104 N_Char : constant Integer := 256 - Threshold + Look_Ahead;
105 -- Character code (= 0..N_CHAR-1)
106 Max_Table : constant Integer := N_Char * 2 - 1;
107
108 type Text_Buffer is array (0 .. String_buffer_size + Look_Ahead - 1) of Byte;
109 empty_buffer : constant Text_Buffer := (others => 32); -- ' '
110
111 -- > The Huffman frequency handling is made generic so we have
112 -- one copy of the tree and of the frequency table for Encode
113 -- and one for Decode
114
115 generic
116 package Huffman is
117 --- Pointing parent nodes.
118 --- Area [Max_Table..(Max_Table + N_CHAR - 1)] are pointers for leaves
119 Parent : array (0 .. Max_Table + N_Char - 1) of Natural;
120 --- Pointing children nodes (son[], son[] + 1)
121 Son : array (0 .. Max_Table - 1) of Natural;
122
123 Root_Position : constant Integer := Max_Table - 1; -- (can be always Son'last ?)
124
125 procedure Start;
126 procedure Update_Freq_Tree (C0 : Natural);
127 end Huffman;
128
129 package body Huffman is
130
131 Freq : array (0 .. Max_Table) of Natural; -- Cumulative freq table
132
133 Max_Freq : constant := 16#8000#;
134 -- ^-- update when cumulative frequency reaches to this value
135
136 procedure Start is
137 I : Natural;
138 begin
139 for J in 0 .. N_Char - 1 loop
140 Freq (J) := 1;
141 Son (J) := J + Max_Table;
142 Parent (J + Max_Table) := J;
143 end loop;
144
145 I := 0;
146 for J in N_Char .. Root_Position loop
147 Freq (J) := Freq (I) + Freq (I + 1);
148 Son (J) := I;
149 Parent (I) := J;
150 Parent (I + 1) := J;
151 I := I + 2;
152 end loop;
153
154 Freq (Freq'Last) := 16#FFFF#; -- ( Max_Table )
155 Parent (Root_Position) := 0;
156 end Start;
157
158 procedure Update_Freq_Tree (C0 : Natural) is
159
160 procedure Reconstruct_Freq_Tree is
161 I, J, K, F, L : Natural;
162 begin
163 -- Halven cumulative freq for leaf nodes
164 J := 0;
165 for I in 0 .. Root_Position loop
166 if Son (I) >= Max_Table then
167 Freq (J) := (Freq (I) + 1) / 2;
168 Son (J) := Son (I);
169 J := J + 1;
170 end if;
171 end loop;
172
173 -- Make a tree : first, connect children nodes
174 I := 0;
175 for J in N_Char .. Root_Position loop -- J : free nodes
176 K := I + 1;
177 F := Freq (I) + Freq (K); -- new frequency
178 Freq (J) := F;
179 K := J - 1;
180 while F < Freq (K) loop
181 K := K - 1;
182 end loop;
183
184 K := K + 1;
185 L := J - K; -- 2007: fix: was L:= (J-K)*2, memcopy parameter remain
186
187 Freq (K + 1 .. K + L) := Freq (K .. K + L - 1); -- shift by one cell right
188 Freq (K) := F;
189 Son (K + 1 .. K + L) := Son (K .. K + L - 1); -- shift by one cell right
190 Son (K) := I;
191 I := I + 2;
192 end loop;
193
194 -- Connect parent nodes
195 for I in 0 .. Max_Table - 1 loop
196 K := Son (I);
197 Parent (K) := I;
198 if K < Max_Table then
199 Parent (K + 1) := I;
200 end if;
201 end loop;
202
203 end Reconstruct_Freq_Tree;
204
205 C, I, J, K, L : Natural;
206
207 begin -- Update_Freq_Tree;
208 if Freq (Root_Position) = Max_Freq then
209 Reconstruct_Freq_Tree;
210 end if;
211 C := Parent (C0 + Max_Table);
212 loop
213 Freq (C) := Freq (C) + 1;
214 K := Freq (C);
215 -- Swap nodes to keep the tree freq-ordered
216 L := C + 1;
217 if K > Freq (L) then
218 while K > Freq (L + 1) loop
219 L := L + 1;
220 end loop;
221
222 Freq (C) := Freq (L);
223 Freq (L) := K;
224
225 I := Son (C);
226 Parent (I) := L;
227 if I < Max_Table then
228 Parent (I + 1) := L;
229 end if;
230
231 J := Son (L);
232 Son (L) := I;
233
234 Parent (J) := C;
235 if J < Max_Table then
236 Parent (J + 1) := C;
237 end if;
238 Son (C) := J;
239
240 C := L;
241 end if;
242 C := Parent (C);
243 exit when C = 0;
244 end loop; -- do it until reaching the root
245 end Update_Freq_Tree;
246
247 end Huffman;
248
249 Node_Nil : constant Integer := String_buffer_size; -- End of tree's node
250
251 Lson, Dad : array (0 .. String_buffer_size) of Natural;
252 Rson : array (0 .. String_buffer_size + 256) of Natural;
253
254 procedure Init_Tree is
255 begin
256 for I in String_buffer_size + 1 .. Rson'Last loop
257 Rson (I) := Node_Nil;
258 end loop; -- root
259 for I in 0 .. String_buffer_size - 1 loop
260 Dad (I) := Node_Nil;
261 end loop; -- node
262 end Init_Tree;
263
264 Match_Position : Natural;
265 Match_Length : Natural;
266
267 Text_Buf : Text_Buffer := empty_buffer;
268
269 procedure Insert_Node (R : Integer) is
270 I, P : Integer;
271 Geq : Boolean := True;
272 C : Natural;
273 begin
274 P := String_buffer_size + 1 + Integer (Text_Buf (R));
275 Rson (R) := Node_Nil;
276 Lson (R) := Node_Nil;
277 Match_Length := 0;
278 loop
279 if Geq then
280 if Rson (P) = Node_Nil then
281 Rson (P) := R;
282 Dad (R) := P;
283 return;
284 end if;
285 P := Rson (P);
286 else
287 if Lson (P) = Node_Nil then
288 Lson (P) := R;
289 Dad (R) := P;
290 return;
291 end if;
292 P := Lson (P);
293 end if;
294 I := 1;
295 while I < Look_Ahead and then Text_Buf (R + I) = Text_Buf (P + I) loop
296 I := I + 1;
297 end loop;
298
299 Geq := Text_Buf (R + I) >= Text_Buf (P + I) or I = Look_Ahead;
300
301 if I > Threshold then
302 if I > Match_Length then
303 Match_Position := (R - P) mod String_buffer_size - 1;
304 Match_Length := I;
305 exit when Match_Length >= Look_Ahead;
306 end if;
307 if I = Match_Length then
308 C := (R - P) mod String_buffer_size - 1;
309 if C < Match_Position then
310 Match_Position := C;
311 end if;
312 end if;
313 end if;
314 end loop;
315
316 Dad (R) := Dad (P);
317 Lson (R) := Lson (P);
318 Rson (R) := Rson (P);
319 Dad (Lson (P)) := R;
320 Dad (Rson (P)) := R;
321 if Rson (Dad (P)) = P then
322 Rson (Dad (P)) := R;
323 else
324 Lson (Dad (P)) := R;
325 end if;
326 Dad (P) := Node_Nil; -- remove P
327 end Insert_Node;
328
329 procedure Delete_Node (P : Natural) is
330 Q : Natural;
331 begin
332 if Dad (P) = Node_Nil then -- unregistered
333 return;
334 end if;
335 if Rson (P) = Node_Nil then
336 Q := Lson (P);
337 elsif Lson (P) = Node_Nil then
338 Q := Rson (P);
339 else
340 Q := Lson (P);
341 if Rson (Q) /= Node_Nil then
342 loop
343 Q := Rson (Q);
344 exit when Rson (Q) = Node_Nil;
345 end loop;
346 Rson (Dad (Q)) := Lson (Q);
347 Dad (Lson (Q)) := Dad (Q);
348 Lson (Q) := Lson (P);
349 Dad (Lson (P)) := Q;
350 end if;
351 Rson (Q) := Rson (P);
352 Dad (Rson (P)) := Q;
353 end if;
354 Dad (Q) := Dad (P);
355 if Rson (Dad (P)) = P then
356 Rson (Dad (P)) := Q;
357 else
358 Lson (Dad (P)) := Q;
359 end if;
360 Dad (P) := Node_Nil;
361 end Delete_Node;
362
363 package Huffman_E is new Huffman;
364
365 I, R, S, Last_Match_Length : Natural;
366 Len : Integer;
367 C : Byte;
368 begin
369 if not More_Bytes then
370 return;
371 end if;
372 Huffman_E.Start;
373 Init_Tree;
374 S := 0;
375 R := String_buffer_size - Look_Ahead;
376 Len := 0;
377 while Len < Look_Ahead and More_Bytes loop
378 Text_Buf (R + Len) := Read_Byte;
379 Len := Len + 1;
380 end loop;
381
382 -- Seems: fill dictionary with default value
383 --
384 -- for I in 1.. Look_Ahead loop
385 -- Insert_Node(R - I);
386 -- end loop;
387
388 Insert_Node (R);
389
390 loop
391 if Match_Length > Len then
392 Match_Length := Len;
393 end if;
394 if Match_Length <= Threshold then
395 Match_Length := 1;
396 Huffman_E.Update_Freq_Tree (Natural (Text_Buf (R)));
397 Write_Literal (Text_Buf (R));
398 else
399 Write_DL_Code (Match_Position + 1, Match_Length);
400 end if;
401 Last_Match_Length := Match_Length;
402 I := 0;
403 while I < Last_Match_Length and More_Bytes loop
404 I := I + 1;
405 Delete_Node (S);
406 C := Read_Byte;
407 Text_Buf (S) := C;
408 if S < Look_Ahead - 1 then
409 Text_Buf (S + String_buffer_size) := C;
410 end if;
411 S := (S + 1) mod String_buffer_size;
412 R := (R + 1) mod String_buffer_size;
413 Insert_Node (R);
414 end loop;
415
416 while I < Last_Match_Length loop
417 I := I + 1;
418 Delete_Node (S);
419 S := (S + 1) mod String_buffer_size;
420 R := (R + 1) mod String_buffer_size;
421 Len := Len - 1;
422 if Len > 0 then
423 Insert_Node (R);
424 end if;
425 end loop;
426
427 exit when Len = 0;
428 end loop;
429 end LZ77_using_LZHuf;
430
431 --------------------------
432 -- Info-Zip algorithm --
433 --------------------------
434
435 -- LZ77_using_IZ: based on deflate.c by Jean-Loup Gailly.
436 -- Core description of the algorithm:
437 --
438 -- The most straightforward technique turns out to be the fastest for
439 -- most input files: try all possible matches and select the longest.
440 -- The key feature of this algorithm is that insertions into the string
441 -- dictionary are very simple and thus fast, and deletions are avoided
442 -- completely. Insertions are performed at each input character, whereas
443 -- string matches are performed only when the previous match ends. So it
444 -- is preferable to spend more time in matches to allow very fast string
445 -- insertions and avoid deletions. The matching algorithm for small
446 -- strings is inspired from that of Rabin & Karp [1]. A brute force approach
447 -- is used to find longer strings when a small match has been found.
448 --
449 -- The idea of lazy evaluation of matches is due to Jan-Mark Wams.
450 --
451 -- [1] A description of the Rabin and Karp algorithm is given in the book
452 -- "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
453 --
454 -- About hashing: chapter 6.4 of The Art of Computer Programming, Volume 3, D.E. Knuth
455 -- Rabin and Karp algorithm: http://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
456
457 -- Compression level: 0: store, 1: best speed, 9: best compression, 10: variant of level 9
458 -- Ada code: only levels 4 to 10 are supported.
459
460 procedure LZ77_using_IZ (level : Natural) is
461 HASH_BITS : constant := 15; -- 13..15
462 HASH_SIZE : constant := 2 ** HASH_BITS;
463 HASH_MASK : constant := HASH_SIZE - 1;
464 WSIZE : constant Integer_M32 := Integer_M32 (String_buffer_size);
465 WMASK : constant Unsigned_M16 := Unsigned_M16 (WSIZE - 1);
466 -- HASH_SIZE and WSIZE must be powers of two
467 NIL : constant := 0; -- Tail of hash chains
468 TOO_FAR : constant := 4096; -- Matches of length 3 are discarded if their distance exceeds TOO_FAR
469 --
470 subtype ulg is Unsigned_M32;
471 subtype unsigned is Unsigned_M16;
472 subtype ush is Unsigned_M16;
473 -- subtype long is Integer_M32;
474 -- subtype int is Integer;
475 subtype Pos is Unsigned_M32; -- must be at least 32 bits
476 -- subtype IPos is unsigned;
477 -- A Pos is an index in the character window. IPos is used only for parameter passing.
478 window : array (0 .. 2 * WSIZE - 1) of Byte;
479 -- Sliding window. Input bytes are read into the second half of the window,
480 -- and move to the first half later to keep a dictionary of at least WSIZE
481 -- bytes. With this organization, matches are limited to a distance of
482 -- WSIZE-MAX_MATCH bytes, but this ensures that IO is always
483 -- performed with a length multiple of the block size.
484 prev : array (0 .. unsigned (WSIZE - 1)) of Pos;
485 -- Link to older string with same hash index.
486 -- This link is maintained only for the last 32K strings.
487 -- An index in this array is thus a window index modulo 32K.
488 head : array (0 .. unsigned (HASH_SIZE - 1)) of Pos;
489 -- Heads of the hash chains or NIL.
490 window_size : ulg;
491 -- window size, 2*WSIZE except for MMAP or BIG_MEM, where it is the
492 -- input file length plus MIN_LOOKAHEAD.
493 sliding : Boolean; -- Set to False when the input file is already in memory [was: int]
494 ins_h : unsigned; -- hash index of string to be inserted
495 MIN_MATCH : constant Integer_M32 := Integer_M32 (Threshold) + 1; -- Deflate: 3
496 MAX_MATCH : constant Integer_M32 := Integer_M32 (Look_Ahead); -- Deflate: 258
497 -- Minimum amount of lookahead, except at the end of the input file.
498 MIN_LOOKAHEAD : constant Integer_M32 := MAX_MATCH + MIN_MATCH + 1; -- Deflate: 262
499 -- This LZ77 compression doesn't use the full possible distance range: 32507..32768 unused!
500 MAX_DIST : constant Integer_M32 := WSIZE - MIN_LOOKAHEAD; -- Deflate: 32506
501 H_SHIFT : constant Integer := Integer ((HASH_BITS + MIN_MATCH - 1) / MIN_MATCH);
502 -- Number of bits by which ins_h and del_h must be shifted at each
503 -- input step. It must be such that after MIN_MATCH steps, the oldest
504 -- byte no longer takes part in the hash key, that is:
505 -- H_SHIFT * MIN_MATCH >= HASH_BITS
506 prev_length : Natural_M32; -- [was: unsigned]
507 -- Length of the best match at previous step. Matches not greater than this
508 -- are discarded. This is used in the lazy match evaluation.
509 strstart : Natural_M32; -- start of string to insert [was: unsigned]
510 match_start : Natural_M32; -- start of matching string [was: unsigned]
511 eofile : Boolean; -- flag set at end of input file [was: int]
512 lookahead : Natural_M32; -- number of valid bytes ahead in window [was: unsigned]
513 max_chain_length : unsigned;
514 -- To speed up deflation, hash chains are never searched beyond this length.
515 -- A higher limit improves compression ratio but degrades the speed.
516 max_lazy_match : Natural_M32; -- [was: unsigned]
517 -- Attempt to find a better match only when the current match is strictly
518 -- smaller than this value. This mechanism is used only for compression
519 -- levels >= 4.
520 good_match : Natural_M32; -- [was: unsigned]
521 -- Use a faster search when the previous match is longer than this
522 nice_match : Integer_M32; -- Stop searching when current match exceeds this
523 -- Values for max_lazy_match, good_match, nice_match and max_chain_length,
524 -- depending on the desired pack level (0..9). The values given below have
525 -- been tuned to exclude worst case performance for pathological files.
526 -- Better values may be found for specific files.
527 type config is record
528 good_length : Natural_M32; -- reduce lazy search above this match length [was: ush]
529 max_lazy : Natural_M32; -- do not perform lazy search above this match length
530 nice_length : Integer_M32; -- quit search above this match length
531 max_chain : ush;
532 end record;
533
534 configuration_table : constant array (0 .. 10) of config := (
535 -- good lazy nice chain
536 (0, 0, 0, 0), -- 0: store only
537 (4, 4, 8, 4), -- 1: maximum speed, no lazy matches
538 (4, 5, 16, 8),
539 (4, 6, 32, 32),
540 (4, 4, 16, 16), -- 4: lazy matches
541 (8, 16, 32, 32),
542 (8, 16, 128, 128),
543 (8, 32, 128, 256),
544 (32, 128, 258, 1024),
545 (32, 258, 258, 4096), -- 9: maximum compression
546 (34, 258, 258, 4096)); -- "secret" variant of level 9
547
548 -- Update a hash value with the given input byte
549 -- IN assertion: all calls to to UPDATE_HASH are made with consecutive
550 -- input characters, so that a running hash key can be computed from the
551 -- previous key instead of complete recalculation each time.
552
553 procedure UPDATE_HASH (h : in out unsigned; c : Byte) is
554 pragma Inline (UPDATE_HASH);
555 begin
556 h := (unsigned (Shift_Left (Unsigned_32 (h), H_SHIFT)) xor unsigned (c)) and HASH_MASK;
557 end UPDATE_HASH;
558
559 -- Insert string starting at s in the dictionary and set match_head to the previous head
560 -- of the hash chain (the most recent string with same hash key). Return
561 -- the previous length of the hash chain.
562 -- IN assertion: all calls to to INSERT_STRING are made with consecutive
563 -- input characters and the first MIN_MATCH bytes of s are valid
564 -- (except for the last MIN_MATCH-1 bytes of the input file).
565
566 procedure INSERT_STRING (s : Integer_M32; match_head : out Natural_M32) is
567 pragma Inline (INSERT_STRING);
568 begin
569 UPDATE_HASH (ins_h, window (s + MIN_MATCH - 1));
570 match_head := Natural_M32 (head (ins_h));
571 prev (unsigned (s) and WMASK) := Pos (match_head);
572 head (ins_h) := Pos (s);
573 end INSERT_STRING;
574
575 procedure Read_buf (from : Integer_M32; amount : unsigned; actual : out Integer_M32) is
576 need : unsigned := amount;
577 begin
578 -- put_line("Read buffer: from:" & from'img & "; amount:" & amount'img);
579 actual := 0;
580 while need > 0 and then More_Bytes loop
581 window (from + actual) := Read_Byte;
582 actual := actual + 1;
583 need := need - 1;
584 end loop;
585 -- put_line("Read buffer: actual:" & actual'img);
586 end Read_buf;
587
588 -- Fill the window when the lookahead becomes insufficient.
589 -- Updates strstart and lookahead, and sets eofile if end of input file.
590 --
591 -- IN assertion: lookahead < MIN_LOOKAHEAD && strstart + lookahead > 0
592 -- OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
593 -- At least one byte has been read, or eofile is set; file reads are
594 -- performed for at least two bytes (required for the translate_eol option).
595
596 procedure Fill_window is
597 more : unsigned;
598 m : Pos;
599 n : Natural_M32;
600 begin
601 loop
602 more := unsigned (window_size - ulg (lookahead) - ulg (strstart));
603 if False then -- C: "if (more == (unsigned)EOF) {" ?... GdM: seems a 16-bit code for EOF
604 -- Very unlikely, but possible on 16 bit machine if strstart == 0
605 -- and lookahead == 1 (input done one byte at time)
606 more := more - 1;
607 elsif strstart >= WSIZE + MAX_DIST and then sliding then
608 -- By the IN assertion, the window is not empty so we can't confuse
609 -- more == 0 with more == 64K on a 16 bit machine.
610 window (0 .. WSIZE - 1) := window (WSIZE .. 2 * WSIZE - 1);
611 -- GdM: in rare cases (e.g. level 9 on test file "enwik8"), match_start happens
612 -- to be < WSIZE. We do as in the original 16-bit C code: mod 2**16, such that the
613 -- index is the window's range.
614 -- This assumes WSIZE = 2**15, which is checked at startup of LZ77_using_IZ.
615 -- Very likely, match_start is garbage anyway - see http://sf.net/p/infozip/bugs/49/
616 match_start := Natural_M32 (Unsigned_16 (match_start) - Unsigned_16 (WSIZE mod (2**16)));
617 strstart := strstart - WSIZE; -- we now have strstart >= MAX_DIST:
618 for nn in 0 .. unsigned'(HASH_SIZE - 1) loop
619 m := head (nn);
620 if m >= Pos (WSIZE) then
621 head (nn) := m - Pos (WSIZE);
622 else
623 head (nn) := NIL;
624 end if;
625 end loop;
626 --
627 for nn in 0 .. unsigned (WSIZE - 1) loop
628 m := prev (nn);
629 if m >= Pos (WSIZE) then
630 prev (nn) := m - Pos (WSIZE);
631 else
632 prev (nn) := NIL;
633 end if;
634 -- If n is not on any hash chain, prev[n] is garbage but its value will never be used.
635 end loop;
636 more := more + unsigned (WSIZE);
637 end if;
638 exit when eofile;
639 -- If there was no sliding:
640 -- strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
641 -- more == window_size - lookahead - strstart
642 -- => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
643 -- => more >= window_size - 2*WSIZE + 2
644 -- In the MMAP or BIG_MEM case (not yet supported in gzip),
645 -- window_size == input_size + MIN_LOOKAHEAD &&
646 -- strstart + lookahead <= input_size => more >= MIN_LOOKAHEAD.
647 -- Otherwise, window_size == 2*WSIZE so more >= 2.
648 -- If there was sliding, more >= WSIZE. So in all cases, more >= 2.
649 --
650 pragma Assert (more >= 2, "more < 2");
651 --
652 Read_buf (strstart + lookahead, more, n);
653 if n = 0 then
654 eofile := True;
655 else
656 lookahead := lookahead + n;
657 end if;
658 exit when lookahead >= MIN_LOOKAHEAD or eofile;
659 end loop;
660 -- put_line("Fill done - eofile = " & eofile'img);
661 end Fill_window;
662
663 -- Initialize the "longest match" routines for a new file
664 --
665 -- IN assertion: window_size is > 0 if the input file is already read or
666 -- mapped in the window array, 0 otherwise. In the first case,
667 -- window_size is sufficient to contain the whole input file plus
668 -- MIN_LOOKAHEAD bytes (to avoid referencing memory beyond the end
669 -- of window when looking for matches towards the end).
670
671 procedure LM_Init (pack_level : Natural) is
672 begin
673 -- Do not slide the window if the whole input is already in memory (window_size > 0)
674 sliding := False;
675 if window_size = 0 then
676 sliding := True;
677 window_size := 2 * ulg (WSIZE);
678 end if;
679 -- Initialize the hash table.
680 -- prev will be initialized on the fly.
681 head := (others => NIL);
682 -- Set the default configuration parameters:
683 max_lazy_match := configuration_table (pack_level).max_lazy;
684 good_match := configuration_table (pack_level).good_length;
685 nice_match := configuration_table (pack_level).nice_length;
686 max_chain_length := configuration_table (pack_level).max_chain;
687 -- Info-Zip comment: ??? reduce max_chain_length for binary files
688 strstart := 0;
689 Read_buf (0, unsigned (WSIZE), lookahead);
690 if lookahead = 0 then
691 eofile := True;
692 return;
693 end if;
694 eofile := False;
695 -- Make sure that we always have enough lookahead. This is important
696 -- if input comes from a device such as a tty.
697 if lookahead < MIN_LOOKAHEAD then
698 Fill_window;
699 end if;
700 ins_h := 0;
701 for j in 0 .. Natural_M32 (MIN_MATCH) - 2 loop
702 UPDATE_HASH (ins_h, window (j));
703 end loop;
704 -- If lookahead < MIN_MATCH, ins_h is garbage, but this is
705 -- not important since only literal bytes will be emitted.
706 end LM_Init;
707
708 -- Set match_start to the longest match starting at the given string and
709 -- return its length. Matches shorter or equal to prev_length are discarded,
710 -- in which case the result is equal to prev_length and match_start is
711 -- garbage.
712 -- IN assertions: current_match is the head of the hash chain for the current
713 -- string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
714
715 procedure Longest_Match (current_match : in out Integer_M32; longest : out Integer_M32) is
716 chain_length : unsigned := max_chain_length; -- max hash chain length
717 scan : Integer_M32 := strstart; -- current string
718 match : Integer_M32; -- matched string
719 len : Integer_M32; -- length of current match
720 best_len : Integer_M32 := prev_length; -- best match length so far
721 limit : Natural_M32; -- [was: IPos]
722 strend : constant Integer_M32 := strstart + MAX_MATCH;
723 scan_end : Integer_M32 := scan + best_len;
724 begin
725 -- Stop when current_match becomes <= limit. To simplify the code,
726 -- we prevent matches with the string of window index 0.
727 if strstart > MAX_DIST then
728 limit := strstart - MAX_DIST;
729 else
730 limit := NIL;
731 end if;
732 -- Do not waste too much time if we already have a good match:
733 if prev_length >= good_match then
734 chain_length := chain_length / 4;
735 end if;
736 pragma Assert
737 (strstart <= Integer_M32 (window_size) - MIN_LOOKAHEAD,
738 "insufficient lookahead"); -- In deflate.c
739 loop
740 if current_match >= strstart then
741 -- Added 2020-11-07. The file test/sample.jpg bombs the assertion a few lines later.
742 longest := MIN_MATCH - 1;
743 return;
744 end if;
745 pragma Assert (current_match < strstart, "no future"); -- In deflate.c
746 match := current_match;
747 -- Skip to next match if the match length cannot increase
748 -- or if the match length is less than 2:
749 --
750 -- NB: this is the Not-UNALIGNED_OK variant in the C code.
751 -- Translation of the UNALIGNED_OK variant is left as an exercise ;-).
752 -- (!! worth a try: GNAT optimizes window(match..match+1[3]) to 16[32] bit)
753 --
754 if window (match + best_len) /= window (scan_end) or else
755 window (match + best_len - 1) /= window (scan_end - 1) or else
756 window (match) /= window (scan) or else
757 window (match + 1) /= window (scan + 1)
758 then
759 match := match + 1; -- C: continue
760 else
761 -- The check at best_len - 1 can be removed because it will be made
762 -- again later. (This heuristic is not always a win.)
763 --
764 -- It is not necessary to compare window(scan + 2) and window(match + 2) since they
765 -- are always equal when the other bytes match, given that
766 -- the hash keys are equal and that HASH_BITS >= 8.
767 scan := scan + 2;
768 match := match + 2;
769 -- C: The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
770 -- It is easy to get rid of this optimization if necessary.
771 -- Ada: see the "else" part below.
772 if MAX_MATCH = 258 then
773 -- We check for insufficient lookahead only every 8th comparison;
774 -- the 256th check will be made at strstart + 258.
775 loop
776 scan := scan + 1;
777 match := match + 1;
778 exit when window (scan) /= window (match);
779 scan := scan + 1;
780 match := match + 1;
781 exit when window (scan) /= window (match);
782 scan := scan + 1;
783 match := match + 1;
784 exit when window (scan) /= window (match);
785 scan := scan + 1;
786 match := match + 1;
787 exit when window (scan) /= window (match);
788 scan := scan + 1;
789 match := match + 1;
790 exit when window (scan) /= window (match);
791 scan := scan + 1;
792 match := match + 1;
793 exit when window (scan) /= window (match);
794 scan := scan + 1;
795 match := match + 1;
796 exit when window (scan) /= window (match);
797 scan := scan + 1;
798 match := match + 1;
799 exit when window (scan) /= window (match) or else scan >= strend;
800 end loop;
801 else
802 -- We check for insufficient lookahead after every comparison.
803 loop
804 scan := scan + 1;
805 match := match + 1;
806 exit when window (scan) /= window (match) or else scan >= strend;
807 end loop;
808 end if;
809 -- Assert(scan <= window+(unsigned)(window_size-1), "wild scan"); ??
810 len := MAX_MATCH - (strend - scan);
811 scan := strend - MAX_MATCH;
812 if len > best_len then
813 match_start := current_match;
814 best_len := len;
815 exit when len >= nice_match;
816 scan_end := scan + best_len;
817 end if;
818 end if;
819 current_match := Integer_M32 (prev (unsigned (current_match) and WMASK));
820 exit when current_match <= limit;
821 chain_length := chain_length - 1;
822 exit when chain_length = 0;
823 end loop;
824 longest := best_len;
825 end Longest_Match;
826
827 procedure LZ77_part_of_IZ_Deflate is
828 hash_head : Natural_M32 := NIL; -- head of hash chain
829 prev_match : Natural_M32; -- previous match [was: IPos]
830 match_available : Boolean := False; -- set if previous match exists
831 match_length : Natural_M32 := MIN_MATCH - 1; -- length of best match
832 max_insert : Natural_M32;
833 begin
834 match_start := 0; -- NB: no initialization in deflate.c
835 -- NB: level <= 3 would call deflate_fast;
836 --
837 -- Process the input block.
838 while lookahead /= 0 loop
839 -- Insert the string window(strstart .. strstart + 2) in the
840 -- dictionary, and set hash_head to the head of the hash chain:
841 if lookahead >= MIN_MATCH then
842 INSERT_STRING (strstart, hash_head);
843 end if;
844 -- Find the longest match, discarding those <= prev_length.
845 prev_length := match_length;
846 prev_match := match_start;
847 match_length := MIN_MATCH - 1;
848 if hash_head /= NIL and then
849 prev_length < max_lazy_match and then
850 strstart - hash_head <= MAX_DIST
851 then
852 -- To simplify the code, we prevent matches with the string
853 -- of window index 0 (in particular we have to avoid a match
854 -- of the string with itself at the start of the input file).
855 --
856 -- Do not look for matches beyond the end of the input.
857 -- This is necessary to make deflate deterministic.
858 if nice_match > lookahead then
859 nice_match := lookahead;
860 end if;
861 Longest_Match (hash_head, match_length);
862 -- Longest_Match sets match_start
863 if match_length > lookahead then
864 match_length := lookahead;
865 end if;
866 -- Ignore a length 3 match if it is too distant:
867 if match_length = MIN_MATCH and then strstart - match_start > TOO_FAR then
868 -- If prev_match is also MIN_MATCH, match_start is garbage
869 -- but we will ignore the current match anyway.
870 match_length := MIN_MATCH - 1;
871 end if;
872 end if;
873 -- If there was a match at the previous step and the current
874 -- match is not better, output the previous match:
875 if prev_length >= MIN_MATCH and then match_length <= prev_length then
876 max_insert := strstart + lookahead - MIN_MATCH;
877 -- C: in DEBUG mode: check_match(strstart-1, prev_match, prev_length);
878 --
879 ------------------------------------
880 -- Output a Distance-Length code --
881 ------------------------------------
882 Write_DL_Code (Positive (strstart - 1 - prev_match), Positive (prev_length));
883 -- Insert in hash table all strings up to the end of the match.
884 -- strstart-1 and strstart are already inserted.
885 lookahead := lookahead - (prev_length - 1);
886 prev_length := prev_length - 2;
887 loop
888 strstart := strstart + 1;
889 if strstart <= max_insert then
890 INSERT_STRING (strstart, hash_head);
891 -- strstart never exceeds WSIZE - MAX_MATCH, so there
892 -- are always MIN_MATCH bytes ahead.
893 end if;
894 prev_length := prev_length - 1;
895 exit when prev_length = 0;
896 end loop;
897 strstart := strstart + 1;
898 match_available := False;
899 match_length := MIN_MATCH - 1;
900 elsif match_available then
901 -- If there was no match at the previous position, output a
902 -- single literal. If there was a match but the current match
903 -- is longer, truncate the previous match to a single literal.
904 --
905 ------------------------
906 -- Output a literal --
907 ------------------------
908 Write_Literal (window (strstart - 1));
909 strstart := strstart + 1;
910 lookahead := lookahead - 1;
911 else
912 -- There is no previous match to compare with, wait for the next step to decide.
913 match_available := True;
914 strstart := strstart + 1;
915 lookahead := lookahead - 1;
916 end if;
917 -- Assert(strstart <= isize && lookahead <= isize, "a bit too far");
918 --
919 -- Make sure that we always have enough lookahead, except
920 -- at the end of the input file. We need MAX_MATCH bytes
921 -- for the next match, plus MIN_MATCH bytes to insert the
922 -- string following the next match.
923 if lookahead < MIN_LOOKAHEAD then
924 Fill_window;
925 end if;
926 end loop;
927 -----------------------------------
928 -- Output last literal, if any --
929 -----------------------------------
930 if match_available then
931 Write_Literal (window (strstart - 1));
932 end if;
933 end LZ77_part_of_IZ_Deflate;
934
935 Code_too_clever : exception;
936 begin
937 if Look_Ahead /= 258 or String_buffer_size /= 2 ** 15 or Threshold /= 2 then
938 raise Code_too_clever; -- was optimized for these parameters
939 end if;
940 window_size := 0;
941 LM_Init (level);
942 LZ77_part_of_IZ_Deflate;
943 end LZ77_using_IZ;
944
945 ---------------------------------------------------------------------
946 -- BT4 - Binary tree of match positions selected with --
947 -- the leading 2 to 4 bytes of each possible match. --
948 ---------------------------------------------------------------------
949
950 -- Based on BT4.java and LZMAEncoderFast.java by Lasse Collin,
951 -- itself based on LzFind.c by Igor Pavlov.
952
953 procedure LZ77_using_BT4 is
954 MATCH_LEN_MIN : constant Integer := Threshold + 1;
955 --
956 readPos : Integer := -1;
957 cur_literal : Byte;
958 readLimit : Integer := -1;
959 finishing : constant Boolean := False;
960 writePos : Integer := 0;
961 pendingSize : Integer := 0;
962 --
963 OPTS : constant := 4096;
964 EXTRA_SIZE_BEFORE : constant := OPTS;
965 EXTRA_SIZE_AFTER : constant := OPTS;
966
967 keepSizeBefore : constant Integer := EXTRA_SIZE_BEFORE + String_buffer_size;
968 keepSizeAfter : constant Integer := EXTRA_SIZE_AFTER + Look_Ahead;
969 reserveSize : constant Integer :=
970 Integer'Min (
971 String_buffer_size / 2 +
972 256 * (2 ** 10), -- 256 KiB
973 512 * (2 ** 20) -- 512 MiB
974 );
975 getBufSize : constant Integer := keepSizeBefore + keepSizeAfter + reserveSize;
976
977 type Int_array is array (Natural range <>) of Integer;
978 type p_Int_array is access Int_array;
979 procedure Dispose is new Ada.Unchecked_Deallocation (Int_array, p_Int_array);
980
981 procedure Normalize (positions : in out Int_array; normalizationOffset : Integer) is
982 begin
983 for i in 0 .. positions'Length - 1 loop
984 if positions (i) <= normalizationOffset then
985 positions (i) := 0;
986 else
987 positions (i) := positions (i) - normalizationOffset;
988 end if;
989 end loop;
990 end Normalize;
991
992 function Get_Available return Integer is
993 pragma Inline (Get_Available);
994 begin
995 -- Compared to the Java version: - 1 shift for getting readPos
996 -- in buf'Range upon: cur_literal := buf (readPos);
997 return writePos - readPos - 1;
998 end Get_Available;
999
1000 function Move_Pos (requiredForFlushing, requiredForFinishing : Integer) return Integer is
1001 -- Java name: movePos.
1002 avail : Integer;
1003 begin
1004 pragma Assert (requiredForFlushing >= requiredForFinishing);
1005 readPos := readPos + 1;
1006 avail := Get_Available;
1007 if avail < requiredForFlushing then
1008 if avail < requiredForFinishing or else not finishing
1009 then
1010 pendingSize := pendingSize + 1;
1011 -- GdM: This causes cyclicPos and lzpos not being in sync with readPos.
1012 -- The pendingSize value is there for catching up.
1013 avail := 0;
1014 end if;
1015 end if;
1016 return avail;
1017 end Move_Pos;
1018
1019 function getHash4Size return Integer is
1020 h : Unsigned_32 := Unsigned_32 (String_buffer_size - 1);
1021 begin
1022 h := h or Shift_Right (h, 1);
1023 h := h or Shift_Right (h, 2);
1024 h := h or Shift_Right (h, 4);
1025 h := h or Shift_Right (h, 8);
1026 h := Shift_Right (h, 1);
1027 h := h or 16#FFFF#; -- LzFind.c: "don't change it! It's required for Deflate"
1028 if h > 2 ** 24 then
1029 h := Shift_Right (h, 1);
1030 end if;
1031 return Integer (h + 1);
1032 end getHash4Size;
1033
1034 type p_Byte_Array is access Byte_Array;
1035 procedure Dispose is new Ada.Unchecked_Deallocation (Byte_Array, p_Byte_Array);
1036
1037 package Hash234 is
1038 HASH_2_SIZE : constant := 2 ** 10;
1039 HASH_2_MASK : constant := HASH_2_SIZE - 1;
1040 HASH_3_SIZE : constant := 2 ** 16;
1041 HASH_3_MASK : constant := HASH_3_SIZE - 1;
1042 hash_4_size : constant Integer := getHash4Size;
1043 hash_4_mask : constant Unsigned_32 := Unsigned_32 (hash_4_size) - 1;
1044 --
1045 hash2Table : Int_array (0 .. HASH_2_SIZE - 1) := (others => 0); -- [Initialization added]
1046 hash3Table : Int_array (0 .. HASH_3_SIZE - 1) := (others => 0); -- [Initialization added]
1047 hash4Table : p_Int_array;
1048 --
1049 hash2Value, hash3Value, hash4Value : Unsigned_32 := 0;
1050 --
1051 procedure calcHashes (buf : Byte_Array; off : Integer);
1052 procedure updateTables (pos : Integer);
1053 procedure Normalize (normalizeOffset : Integer);
1054 end Hash234;
1055
1056 package body Hash234 is
1057
1058 crcTable : array (Byte) of Unsigned_32;
1059 CRC32_POLY : constant := 16#EDB8_8320#;
1060
1061 procedure calcHashes (buf : Byte_Array; off : Integer) is
1062 temp : Unsigned_32 := crcTable (buf (off)) xor Unsigned_32 (buf (off + 1));
1063 begin
1064 hash2Value := temp and HASH_2_MASK;
1065 temp := temp xor Shift_Left (Unsigned_32 (buf (off + 2)), 8);
1066 hash3Value := temp and HASH_3_MASK;
1067 temp := temp xor Shift_Left (crcTable (buf (off + 3)), 5);
1068 hash4Value := temp and hash_4_mask;
1069 end calcHashes;
1070
1071 procedure updateTables (pos : Integer) is
1072 begin
1073 hash2Table (Integer (hash2Value)) := pos;
1074 hash3Table (Integer (hash3Value)) := pos;
1075 hash4Table (Integer (hash4Value)) := pos;
1076 end updateTables;
1077
1078 procedure Normalize (normalizeOffset : Integer) is
1079 begin
1080 Normalize (hash2Table, normalizeOffset);
1081 Normalize (hash3Table, normalizeOffset);
1082 Normalize (hash4Table.all, normalizeOffset);
1083 end Normalize;
1084
1085 r : Unsigned_32;
1086 begin
1087 -- NB: heap allocation used only for convenience because of
1088 -- small default stack sizes on some compilers.
1089 hash4Table := new Int_array (0 .. hash_4_size - 1);
1090 hash4Table.all := (others => 0); -- [Initialization added]
1091 for i in Byte loop
1092 r := Unsigned_32 (i);
1093 for j in 0 .. 7 loop
1094 if (r and 1) = 0 then
1095 r := Shift_Right (r, 1);
1096 else
1097 r := Shift_Right (r, 1) xor CRC32_POLY;
1098 end if;
1099 end loop;
1100 crcTable (i) := r;
1101 end loop;
1102 end Hash234;
1103
1104 Nice_Length : constant Integer := Integer'Min (162, Look_Ahead); -- const. was 64
1105 Depth_Limit : constant := 48; -- Alternatively: 16 + Nice_Length / 2
1106
1107 cyclicSize : constant Integer := String_buffer_size; -- Had: + 1;
1108 cyclicPos : Integer := -1;
1109 lzPos : Integer := cyclicSize;
1110
1111 max_dist : constant Integer := cyclicSize - (Look_Ahead + 2);
1112 -- NB: 2020-11-04: added "- (Look_Ahead + 2)" to prevent corruption of
1113 -- the expansion buffer in LZMA.Encoding when DL codes are tested in front
1114 -- of the actual writes, before actual entropy compression (since rev. #850).
1115
1116 package BT4_Algo is
1117 procedure Skip (len : Natural);
1118 pragma Inline (Skip);
1119 procedure Read_One_and_Get_Matches (matches : out Matches_Type);
1120 end BT4_Algo;
1121
1122 buf : p_Byte_Array;
1123 tree : p_Int_array;
1124
1125 package body BT4_Algo is
1126
1127 function Move_Pos_in_BT4 return Integer is
1128 -- Java name: movePos.
1129 avail : constant Integer :=
1130 Move_Pos (requiredForFlushing => Nice_Length,
1131 requiredForFinishing => 4);
1132 normalizationOffset : Integer;
1133 begin
1134 -- Put_Line ("BT4_Algo.Move_Pos_in_BT4");
1135 if avail /= 0 then
1136 lzPos := lzPos + 1;
1137 if lzPos = Integer'Last then
1138 normalizationOffset := Integer'Last - cyclicSize;
1139 Hash234.Normalize (normalizationOffset);
1140 Normalize (tree.all, normalizationOffset);
1141 lzPos := lzPos - normalizationOffset;
1142 end if;
1143 cyclicPos := cyclicPos + 1;
1144 if cyclicPos = cyclicSize then
1145 -- Put_Line("cyclicPos zeroed");
1146 cyclicPos := 0;
1147 end if;
1148 end if;
1149 return avail;
1150 end Move_Pos_in_BT4;
1151
1152 Null_position : constant := -1; -- LzFind.c: kEmptyHashValue, 0
1153
1154 procedure Skip_and_Update_Tree (niceLenLimit : Integer; currentMatch : in out Integer) is
1155 delta0, depth, ptr0, ptr1, pair, len, len0, len1 : Integer;
1156 begin
1157 -- Put("BT4_Algo.Skip_and_Update_Tree... ");
1158 depth := Depth_Limit;
1159 ptr0 := cyclicPos * 2 + 1;
1160 ptr1 := cyclicPos * 2;
1161 len0 := 0;
1162 len1 := 0;
1163 loop
1164 delta0 := lzPos - currentMatch;
1165 if depth = 0 or else delta0 >= max_dist then
1166 tree (ptr0) := Null_position;
1167 tree (ptr1) := Null_position;
1168 return;
1169 end if;
1170 depth := depth - 1;
1171 if cyclicPos - delta0 < 0 then
1172 pair := cyclicSize;
1173 else
1174 pair := 0;
1175 end if;
1176 pair := (cyclicPos - delta0 + pair) * 2;
1177 len := Integer'Min (len0, len1);
1178 -- Match ?
1179 if buf (readPos + len - delta0) = buf (readPos + len) then
1180 -- No need to look for longer matches than niceLenLimit
1181 -- because we only are updating the tree, not returning
1182 -- matches found to the caller.
1183 loop
1184 len := len + 1;
1185 if len = niceLenLimit then
1186 tree (ptr1) := tree (pair);
1187 tree (ptr0) := tree (pair + 1);
1188 return;
1189 end if;
1190 exit when buf (readPos + len - delta0) /= buf (readPos + len);
1191 end loop;
1192 end if;
1193 -- Bytes are no more matching. The past value is either smaller...
1194 if buf (readPos + len - delta0) < buf (readPos + len) then
1195 tree (ptr1) := currentMatch;
1196 ptr1 := pair + 1;
1197 currentMatch := tree (ptr1);
1198 len1 := len;
1199 else -- ... or larger
1200 tree (ptr0) := currentMatch;
1201 ptr0 := pair;
1202 currentMatch := tree (ptr0);
1203 len0 := len;
1204 end if;
1205 end loop;
1206 end Skip_and_Update_Tree;
1207
1208 procedure Skip (len : Natural) is
1209 --
1210 procedure Skip_one is
1211 pragma Inline (Skip_one);
1212 niceLenLimit, avail, currentMatch : Integer;
1213 begin
1214 niceLenLimit := Nice_Length;
1215 avail := Move_Pos_in_BT4;
1216 if avail < niceLenLimit then
1217 if avail = 0 then
1218 return;
1219 end if;
1220 niceLenLimit := avail;
1221 end if;
1222 Hash234.calcHashes (buf.all, readPos);
1223 currentMatch := Hash234.hash4Table (Integer (Hash234.hash4Value));
1224 Hash234.updateTables (lzPos);
1225 Skip_and_Update_Tree (niceLenLimit, currentMatch);
1226 end Skip_one;
1227 --
1228 begin
1229 for count in reverse 1 .. len loop
1230 Skip_one;
1231 end loop;
1232 end Skip;
1233
1234 procedure Read_One_and_Get_Matches (matches : out Matches_Type) is
1235 matchLenLimit : Integer := Look_Ahead;
1236 niceLenLimit : Integer := Nice_Length;
1237 avail : Integer;
1238 delta0, delta2, delta3, currentMatch,
1239 lenBest, depth, ptr0, ptr1, pair, len, len0, len1 : Integer;
1240 begin
1241 -- Put("BT4_Algo.Get_Matches... ");
1242 matches.count := 0;
1243 avail := Move_Pos_in_BT4;
1244 if avail < matchLenLimit then
1245 if avail = 0 then
1246 return;
1247 end if;
1248 matchLenLimit := avail;
1249 if niceLenLimit > avail then
1250 niceLenLimit := avail;
1251 end if;
1252 end if;
1253 --
1254 Hash234.calcHashes (buf.all, readPos);
1255 delta2 := lzPos - Hash234.hash2Table (Integer (Hash234.hash2Value));
1256 delta3 := lzPos - Hash234.hash3Table (Integer (Hash234.hash3Value));
1257 currentMatch := Hash234.hash4Table (Integer (Hash234.hash4Value));
1258 Hash234.updateTables (lzPos);
1259 --
1260 lenBest := 0;
1261 -- See if the hash from the first two bytes found a match.
1262 -- The hashing algorithm guarantees that if the first byte
1263 -- matches, also the second byte does, so there's no need to
1264 -- test the second byte.
1265 if delta2 < max_dist and then buf (readPos - delta2) = buf (readPos) then
1266 -- Match of length 2 found and checked.
1267 lenBest := 2;
1268 matches.count := 1;
1269 matches.dl (matches.count).length := 2;
1270 matches.dl (matches.count).distance := delta2;
1271 end if;
1272 -- See if the hash from the first three bytes found a match that
1273 -- is different from the match possibly found by the two-byte hash.
1274 -- Also here the hashing algorithm guarantees that if the first byte
1275 -- matches, also the next two bytes do.
1276 if delta2 /= delta3 and then delta3 < max_dist
1277 and then buf (readPos - delta3) = buf (readPos)
1278 then
1279 -- Match of length 3 found and checked.
1280 lenBest := 3;
1281 matches.count := matches.count + 1;
1282 matches.dl (matches.count).distance := delta3;
1283 delta2 := delta3;
1284 end if;
1285 -- If a match was found, see how long it is.
1286 if matches.count > 0 then
1287 while lenBest < matchLenLimit and then buf (readPos + lenBest - delta2)
1288 = buf (readPos + lenBest)
1289 loop
1290 lenBest := lenBest + 1;
1291 end loop;
1292 matches.dl (matches.count).length := lenBest;
1293 -- Return if it is long enough (niceLen or reached the end of the dictionary).
1294 if lenBest >= niceLenLimit then
1295 Skip_and_Update_Tree (niceLenLimit, currentMatch);
1296 return;
1297 end if;
1298 end if;
1299 -- A long enough match wasn't found so easily.
1300 -- Look for better matches from the binary tree.
1301 if lenBest < 3 then
1302 lenBest := 3;
1303 end if;
1304 depth := Depth_Limit;
1305 ptr0 := cyclicPos * 2 + 1;
1306 ptr1 := cyclicPos * 2;
1307 len0 := 0;
1308 len1 := 0;
1309 --
1310 loop
1311 delta0 := lzPos - currentMatch;
1312 -- Return if the search depth limit has been reached or
1313 -- if the distance of the potential match exceeds the
1314 -- dictionary size.
1315 if depth = 0 or else delta0 >= max_dist then
1316 tree (ptr0) := Null_position;
1317 tree (ptr1) := Null_position;
1318 return;
1319 end if;
1320 depth := depth - 1;
1321 --
1322 if cyclicPos - delta0 < 0 then
1323 pair := cyclicSize;
1324 else
1325 pair := 0;
1326 end if;
1327 pair := (cyclicPos - delta0 + pair) * 2;
1328 len := Integer'Min (len0, len1);
1329 -- Match ?
1330 if buf (readPos + len - delta0) = buf (readPos + len) then
1331 loop
1332 len := len + 1;
1333 exit when len >= matchLenLimit
1334 or else buf (readPos + len - delta0) /= buf (readPos + len);
1335 end loop;
1336 if len > lenBest then
1337 lenBest := len;
1338 matches.count := matches.count + 1;
1339 matches.dl (matches.count).length := len;
1340 matches.dl (matches.count).distance := delta0;
1341 if len >= niceLenLimit then
1342 tree (ptr1) := tree (pair);
1343 tree (ptr0) := tree (pair + 1);
1344 return;
1345 end if;
1346 end if;
1347 end if;
1348 -- Bytes are no more matching. The past value is either smaller...
1349 if buf (readPos + len - delta0) < buf (readPos + len) then
1350 tree (ptr1) := currentMatch;
1351 ptr1 := pair + 1;
1352 currentMatch := tree (ptr1);
1353 len1 := len;
1354 else -- ... or larger
1355 tree (ptr0) := currentMatch;
1356 ptr0 := pair;
1357 currentMatch := tree (ptr0);
1358 len0 := len;
1359 end if;
1360 end loop;
1361 end Read_One_and_Get_Matches;
1362
1363 begin
1364 -- NB: heap allocation used only for convenience because of
1365 -- small default stack sizes on some compilers.
1366 tree := new Int_array (0 .. cyclicSize * 2 - 1);
1367 for i in tree'Range loop
1368 tree (i) := Null_position;
1369 end loop;
1370 end BT4_Algo;
1371
1372 -- Moves data from the end of the buffer to the beginning, discarding
1373 -- old data and making space for new input.
1374
1375 procedure Move_Window is
1376 -- Java name: moveWindow.
1377 -- Align the move to a multiple of 16 bytes (LZMA-friendly, see pos_bits)
1378 moveOffset : constant Integer := ((readPos + 1 - keepSizeBefore) / 16) * 16;
1379 moveSize : constant Integer := writePos - moveOffset;
1380 begin
1381 -- Put_Line(" Move window, size=" & moveSize'Img & " offset=" & moveOffset'Img);
1382 buf (0 .. moveSize - 1) := buf (moveOffset .. moveOffset + moveSize - 1);
1383 readPos := readPos - moveOffset;
1384 readLimit := readLimit - moveOffset;
1385 writePos := writePos - moveOffset;
1386 end Move_Window;
1387
1388 -- Copies new data into the buffer.
1389 function Fill_Window (len_initial : Integer) return Integer is
1390 -- Java name: fillWindow
1391
1392 -- Process pending data that hasn't been ran through the match finder yet.
1393 -- Run it through the match finder now if there is enough new data
1394 -- available (readPos < readLimit) that the encoder may encode at
1395 -- least one more input byte.
1396 --
1397 procedure processPendingBytes is
1398 oldPendingSize : Integer;
1399 begin
1400 if pendingSize > 0 and then readPos < readLimit then
1401 readPos := readPos - pendingSize;
1402 oldPendingSize := pendingSize;
1403 pendingSize := 0;
1404 BT4_Algo.Skip (oldPendingSize);
1405 end if;
1406 end processPendingBytes;
1407 --
1408 len : Integer := len_initial;
1409 actual_len : Integer := 0;
1410 begin
1411 -- Put_Line("Fill window - start");
1412 -- Move the sliding window if needed.
1413 if readPos >= buf'Length - keepSizeAfter then
1414 Move_Window;
1415 end if;
1416
1417 -- Try to fill the dictionary buffer up to its boundary.
1418 if len > buf'Length - writePos then
1419 len := buf'Length - writePos;
1420 end if;
1421
1422 while len > 0 and then More_Bytes loop
1423 buf (writePos) := Read_Byte;
1424 writePos := writePos + 1;
1425 len := len - 1;
1426 actual_len := actual_len + 1;
1427 end loop;
1428
1429 -- Set the new readLimit but only if there's enough data to allow
1430 -- encoding of at least one more byte.
1431 if writePos >= keepSizeAfter then
1432 readLimit := writePos - keepSizeAfter;
1433 end if;
1434
1435 processPendingBytes;
1436
1437 -- Put_Line("Fill window, requested=" & len_initial'Img & " actual=" & actual_len'Img);
1438 -- Tell the caller how much input we actually copied into the dictionary.
1439 return actual_len;
1440 end Fill_Window;
1441
1442 function Compute_Match_Length (distance, length_limit : Integer) return Natural is
1443 pragma Inline (Compute_Match_Length);
1444 back_pos : constant Integer := readPos - distance;
1445 len : Integer := 0;
1446 begin
1447 if distance < 2 then
1448 return 0;
1449 end if;
1450 -- @ if readPos+len not in buf.all'Range then
1451 -- @ Put("**** readpos " & buf'Last'Img & readPos'Img);
1452 -- @ end if;
1453 -- @ if backPos+len not in buf.all'Range then
1454 -- @ Put("**** backpos " & buf'Last'Img & back_pos'Img);
1455 -- @ end if;
1456 while len < length_limit and then buf (readPos + len) = buf (back_pos + len) loop
1457 len := len + 1;
1458 end loop;
1459 return len;
1460 end Compute_Match_Length;
1461
1462 readAhead : Integer := -1; -- LZMAEncoder.java
1463 -- Small stack of recent distances used for LZMA.
1464 subtype Repeat_stack_range is Integer range 0 .. 3;
1465 -- 1-based distances.
1466 rep_dist : array (Repeat_stack_range) of Distance_Type := (others => 1);
1467 len_rep_dist : array (Repeat_stack_range) of Natural := (others => 0);
1468
1469 function Has_much_smaller_Distance (smallDist, bigDist : Distance_Type) return Boolean is
1470 pragma Inline (Has_much_smaller_Distance);
1471 begin
1472 return (smallDist - 1) < (bigDist - 1) / 128;
1473 end Has_much_smaller_Distance;
1474
1475 best_length_for_rep_dist, best_rep_dist_index : Integer;
1476
1477 procedure Read_One_and_Get_Matches (matches : out Matches_Type) is
1478 avail, len : Integer;
1479 begin
1480 readAhead := readAhead + 1;
1481 --
1482 BT4_Algo.Read_One_and_Get_Matches (matches);
1483 --
1484 if LZMA_friendly then
1485 best_length_for_rep_dist := 0;
1486 avail := Integer'Min (Get_Available, Look_Ahead);
1487 if avail >= MATCH_LEN_MIN then
1488 for rep in Repeat_stack_range loop
1489 len := Compute_Match_Length (rep_dist (rep), avail);
1490 len_rep_dist (rep) := len;
1491 -- Remember the index and length of the best repeated match.
1492 if len > best_length_for_rep_dist then
1493 best_rep_dist_index := rep;
1494 best_length_for_rep_dist := len;
1495 end if;
1496 end loop;
1497 else
1498 for rep in Repeat_stack_range loop
1499 len_rep_dist (rep) := 0; -- No match possible in any case.
1500 end loop;
1501 end if;
1502 end if;
1503 end Read_One_and_Get_Matches;
1504
1505 procedure Get_supplemental_Matches_from_Repeat_Matches (matches : in out Matches_Type) is
1506 len, ins : Integer;
1507 begin
1508 if matches.count = 0 then
1509 if best_length_for_rep_dist >= MATCH_LEN_MIN then
1510 matches.dl (1).distance := rep_dist (best_rep_dist_index);
1511 matches.dl (1).length := best_length_for_rep_dist;
1512 matches.count := 1;
1513 end if;
1514 end if;
1515 for rep in Repeat_stack_range loop
1516 len := len_rep_dist (rep);
1517 if len >= MATCH_LEN_MIN then
1518 ins := 0;
1519 for i in reverse 1 .. matches.count loop
1520 if len = matches.dl (i).length then
1521 if rep_dist (rep) = matches.dl (i).distance then
1522 null; -- Identical match
1523 else
1524 -- Tie: insert the repeat match of same length into the list.
1525 -- If the longest match strategy is applied, the second item is preferred.
1526 if Has_much_smaller_Distance (matches.dl (i).distance, rep_dist (rep)) then
1527 ins := i; -- Insert before
1528 else
1529 ins := i + 1; -- Insert after
1530 end if;
1531 exit;
1532 -- Ada.Text_IO.Put_Line ("Tie");
1533 end if;
1534 elsif i < matches.count then
1535 if len > matches.dl (i).length and then len < matches.dl (i + 1).length then
1536 -- Insert between existing lengths
1537 ins := i + 1;
1538 exit;
1539 -- We don't add len as the shortest length (worsens compression).
1540 ------
1541 -- elsif i = 1
1542 -- and then len >= MATCH_LEN_MIN
1543 -- and then len >= matches.dl (1).length - 1 -- Some reluctance...
1544 -- then
1545 -- ins := 1;
1546 end if;
1547 elsif len > matches.dl (i).length then
1548 -- i = matches.count in this case: add as longest.
1549 ins := i + 1;
1550 exit;
1551 end if;
1552 end loop;
1553 -- We can insert this repeat match at position 'ins'.
1554 if ins > 0 then
1555 for i in reverse ins .. matches.count loop -- Empty if ins > count.
1556 matches.dl (i + 1) := matches.dl (i);
1557 end loop;
1558 matches.dl (ins).distance := rep_dist (rep);
1559 matches.dl (ins).length := len;
1560 matches.count := matches.count + 1;
1561 exit;
1562 end if;
1563 end if;
1564 end loop;
1565 pragma Assert (Are_Matches_Sorted (matches));
1566 end Get_supplemental_Matches_from_Repeat_Matches;
1567
1568 procedure Skip (len : Natural) is
1569 pragma Inline (Skip);
1570 begin
1571 readAhead := readAhead + len;
1572 BT4_Algo.Skip (len);
1573 end Skip;
1574
1575 procedure Reduce_consecutive_max_lengths (m : in out Matches_Type) is
1576 -- Sometimes the BT4 algo returns a long list with consecutive lengths.
1577 -- We try to reduce it, if there is a clear advantage with distances.
1578 begin
1579 while m.count > 1
1580 and then m.dl (m.count).length = m.dl (m.count - 1).length + 1
1581 and then Has_much_smaller_Distance (m.dl (m.count - 1).distance, m.dl (m.count).distance)
1582 loop
1583 m.count := m.count - 1;
1584 end loop;
1585 end Reduce_consecutive_max_lengths;
1586
1587 procedure Show_Matches (m : Matches_Type; phase : String) is
1588 begin
1589 Ada.Text_IO.Put_Line (
1590 phase & " --- Matches: " & Integer'Image (m.count)
1591 );
1592 for i in 1 .. m.count loop
1593 Ada.Text_IO.Put_Line (
1594 " Distance:" & Integer'Image (m.dl (i).distance) &
1595 "; Length:" & Integer'Image (m.dl (i).length)
1596 );
1597 end loop;
1598 end Show_Matches;
1599 pragma Unreferenced (Show_Matches);
1600
1601 matches : Matches_Array (0 .. 1);
1602 current_match_index : Prefetch_Index_Type := 0;
1603 match_trace : DLP_Array (1 .. Max_Length_any_Algo);
1604
1605 procedure Get_Next_Symbol is
1606 new_ld, main : Distance_Length_Pair;
1607
1608 -- This function is for debugging. The matches stored in the 'tree' array
1609 -- may be wrong if the variables cyclicPos, lzPos and readPos are not in sync.
1610 -- The issue seems to have been solved now (rev. 489).
1611 function Is_match_correct (shift : Natural) return Boolean is
1612 begin
1613 for i in reverse -1 + shift .. main.length - 2 + shift loop
1614 if buf (readPos - (main.distance) + i) /= buf (readPos + i) then
1615 return False; -- Should not occur.
1616 end if;
1617 end loop;
1618 return True;
1619 end Is_match_correct;
1620
1621 procedure Send_first_literal_of_match is
1622 begin
1623 Write_Literal (cur_literal);
1624 readAhead := readAhead - 1;
1625 end Send_first_literal_of_match;
1626
1627 procedure Send_DL_code (distance, length : Integer) is
1628 found_repeat : Integer := rep_dist'First - 1;
1629 aux : Integer;
1630 begin
1631 Write_DL_Code (distance, length);
1632 readAhead := readAhead - length;
1633 if LZMA_friendly then
1634 --
1635 -- Manage the stack of recent distances in the same way the "MA" part of LZMA does.
1636 --
1637 for i in rep_dist'Range loop
1638 if distance = rep_dist (i) then
1639 found_repeat := i;
1640 exit;
1641 end if;
1642 end loop;
1643 if found_repeat >= rep_dist'First then
1644 -- Roll the stack of recent distances up to the item with index found_repeat,
1645 -- which becomes first. If found_repeat = rep_dist'First, no actual change occurs.
1646 aux := rep_dist (found_repeat);
1647 for i in reverse rep_dist'First + 1 .. found_repeat loop
1648 rep_dist (i) := rep_dist (i - 1);
1649 end loop;
1650 rep_dist (rep_dist'First) := aux;
1651 else
1652 -- Shift the stack of recent distances; the new distance becomes the first item.
1653 for i in reverse rep_dist'First + 1 .. rep_dist'Last loop
1654 rep_dist (i) := rep_dist (i - 1);
1655 end loop;
1656 rep_dist (0) := distance;
1657 end if;
1658 end if;
1659 end Send_DL_code;
1660
1661 avail, limit : Integer;
1662 index_max_score : Positive;
1663 set_max_score : Prefetch_Index_Type;
1664 hurdle : constant := 40;
1665 begin
1666 -- Get the matches for the next byte unless readAhead indicates
1667 -- that we already got the new matches during the previous call
1668 -- to this procedure.
1669 if readAhead = -1 then
1670 Read_One_and_Get_Matches (matches (current_match_index));
1671 end if;
1672 -- @ if readPos not in buf.all'Range then
1673 -- @ Put("**** " & buf'Last'Img & keepSizeAfter'Img & readPos'Img & writePos'Img);
1674 -- @ end if;
1675 cur_literal := buf (readPos);
1676 -- Get the number of bytes available in the dictionary, but
1677 -- not more than the maximum match length. If there aren't
1678 -- enough bytes remaining to encode a match at all, return
1679 -- immediately to encode this byte as a literal.
1680 avail := Integer'Min (Get_Available, Look_Ahead);
1681 if avail < MATCH_LEN_MIN then
1682 -- Put("[a]");
1683 Send_first_literal_of_match;
1684 return;
1685 end if;
1686
1687 if LZMA_friendly and then best_length_for_rep_dist >= Nice_Length then
1688 Skip (best_length_for_rep_dist - 1);
1689 -- Put_Line("[DL RA]");
1690 Send_DL_code (rep_dist (best_rep_dist_index), best_length_for_rep_dist);
1691 return;
1692 end if;
1693
1694 main := (length => 1, distance => 1);
1695 if matches (current_match_index).count > 0 then
1696 main := matches (current_match_index).dl (matches (current_match_index).count);
1697 if main.length >= Nice_Length then
1698 pragma Assert (Is_match_correct (1));
1699 Skip (main.length - 1);
1700 -- Put_Line("[DL A]" & mainDist'Img & mainLen'Img);
1701 Send_DL_code (main.distance, main.length);
1702 return;
1703 end if;
1704 Reduce_consecutive_max_lengths (matches (current_match_index));
1705 if LZMA_friendly then
1706 Get_supplemental_Matches_from_Repeat_Matches (matches (current_match_index));
1707 end if;
1708 main := matches (current_match_index).dl (matches (current_match_index).count);
1709 --
1710 if main.length = MATCH_LEN_MIN and then main.distance > 128 then
1711 main.length := 1;
1712 end if;
1713 end if;
1714
1715 if LZMA_friendly
1716 and then best_length_for_rep_dist > MATCH_LEN_MIN
1717 and then (best_length_for_rep_dist >= main.length
1718 or else (best_length_for_rep_dist >= main.length - 2 and then main.distance > 2 ** 9)
1719 or else (best_length_for_rep_dist >= main.length - 3 and then main.distance > 2 ** 15))
1720 then
1721 -- Shortcut: we choose the longest repeat match.
1722 Skip (best_length_for_rep_dist - 1);
1723 -- Put_Line("[DL RB]");
1724 Send_DL_code (rep_dist (best_rep_dist_index), best_length_for_rep_dist);
1725 return;
1726 end if;
1727
1728 if main.length < MATCH_LEN_MIN or else avail <= MATCH_LEN_MIN then
1729 -- Put("[b]");
1730 Send_first_literal_of_match;
1731 return;
1732 end if;
1733
1734 -------------------------------------------------------------------------
1735 -- Get the next match. Test if it is better than the current match. --
1736 -- If so, encode the current byte as a literal. --
1737 -------------------------------------------------------------------------
1738 current_match_index := 1 - current_match_index;
1739 Read_One_and_Get_Matches (matches (current_match_index));
1740 --
1741 -- Show_Matches (matches (1 - current_match_index), "------ Old");
1742 -- Show_Matches (matches (current_match_index), " New");
1743 --
1744 if matches (current_match_index).count > 0 then
1745 new_ld := matches (current_match_index).dl (matches (current_match_index).count); -- Longest new match
1746 if (new_ld.length >= main.length + hurdle and then new_ld.distance < main.distance)
1747 or else
1748 (new_ld.length = main.length + hurdle + 1
1749 and then not Has_much_smaller_Distance (main.distance, new_ld.distance))
1750 or else new_ld.length > main.length + hurdle + 1
1751 or else (new_ld.length >= main.length + hurdle - 1
1752 and then main.length >= MATCH_LEN_MIN + 1
1753 and then Has_much_smaller_Distance (new_ld.distance, main.distance))
1754 then
1755 -- We prefer literal, then the new match (or even better!)
1756 Send_first_literal_of_match;
1757 return;
1758 end if;
1759 --
1760 -- Here we compare the scores of both match sets.
1761 --
1762 Reduce_consecutive_max_lengths (matches (current_match_index));
1763 if LZMA_friendly then
1764 Get_supplemental_Matches_from_Repeat_Matches (matches (current_match_index));
1765 end if;
1766 Estimate_DL_Codes (
1767 matches, 1 - current_match_index, (1 => cur_literal),
1768 index_max_score, set_max_score, match_trace
1769 );
1770 if set_max_score = 1 - current_match_index then
1771 -- Old match is seems better.
1772 main := matches (set_max_score).dl (index_max_score);
1773 else
1774 -- We prefer at least a literal, then a new, better match.
1775 Send_first_literal_of_match;
1776 return;
1777 end if;
1778 end if;
1779
1780 if LZMA_friendly then
1781 limit := Integer'Max (main.length - 1, MATCH_LEN_MIN);
1782 for rep in rep_dist'Range loop
1783 if Compute_Match_Length (rep_dist (rep), limit) = limit then
1784 -- A "literal then DL_Code (some distance, main.length - 1)" match
1785 -- is verified and could use the stack of last distances -> got for it!
1786 Send_first_literal_of_match;
1787 return;
1788 end if;
1789 end loop;
1790 end if;
1791
1792 pragma Assert (Is_match_correct (0));
1793 Skip (main.length - 2);
1794 -- Put_Line("[DL B]" & mainDist'Img & mainLen'Img);
1795 Send_DL_code (main.distance, main.length);
1796 end Get_Next_Symbol;
1797
1798 procedure Deallocation is
1799 begin
1800 Dispose (buf);
1801 Dispose (tree);
1802 Dispose (Hash234.hash4Table);
1803 end Deallocation;
1804
1805 actual_written, avail : Integer;
1806 begin
1807 -- NB: heap allocation used only for convenience because of
1808 -- the small default stack sizes on some compilers.
1809 buf := new Byte_Array (0 .. getBufSize);
1810 --
1811 actual_written := Fill_Window (String_buffer_size);
1812 if actual_written > 0 then
1813 loop
1814 Get_Next_Symbol;
1815 avail := Get_Available;
1816 if avail = 0 then
1817 actual_written := Fill_Window (String_buffer_size);
1818 exit when actual_written = 0;
1819 end if;
1820 end loop;
1821 end if;
1822 Deallocation;
1823 exception
1824 when others =>
1825 Deallocation;
1826 raise;
1827 end LZ77_using_BT4;
1828
1829 procedure LZ77_by_Rich is
1830 -- * PROG2.C [lz77a.c] *
1831 -- * Simple Hashing LZ77 Sliding Dictionary Compression Program *
1832 -- * By Rich Geldreich, Jr. October, 1993 *
1833 -- * Originally compiled with QuickC v2.5 in the small model. *
1834 -- * This program uses more efficient code to delete strings from *
1835 -- * the sliding dictionary compared to PROG1.C, at the expense of *
1836 -- * greater memory requirements. See the HashData and DeleteData *
1837 -- * subroutines. *
1838 --
1839 -- Comments by GdM, 2019+ appear in square brackets: [...]
1840
1841 -- Set this to True for a greedy encoder.
1842 GREEDY : constant Boolean := False; -- [original: False]
1843
1844 -- Ratio vs. speed constant [ Is it really a ratio? ].
1845 -- The larger this constant, the better the compression.
1846 MAXCOMPARES : constant := 4096; -- [original: 75; good: 2400; from Info-Zip: 4096]
1847
1848 -- Unused entry code.
1849 NIL : constant := 16#FFFF#;
1850
1851 -- /* bits per symbol- normally 8 for general purpose compression */
1852 -- #define CHARBITS : constant := 8; [ NB: dictionary uses char (byte) ]
1853
1854 -- Minimum match length & maximum match length.
1855 THRESHOLD_Rich : constant := 2;
1856 MATCHBITS : constant := 8; -- [original: 4]
1857 -- [original: 2 ** MATCHBITS + THRESHOLD - 1]
1858 MAXMATCH : constant := 2 ** MATCHBITS + THRESHOLD_Rich; -- 258 is Deflate-friendly.
1859
1860 -- Sliding dictionary size and hash table's size.
1861 -- Some combinations of HASHBITS and THRESHOLD values will not work
1862 -- correctly because of the way this program hashes strings.
1863
1864 DICTBITS : constant := 15; -- [original: 13]
1865 HASHBITS : constant := 13; -- [original: 10]
1866 --
1867 DICTSIZE : constant := 2 ** DICTBITS;
1868 HASHSIZE : constant := 2 ** HASHBITS;
1869
1870 -- # bits to shift after each XOR hash
1871 -- This constant must be high enough so that only THRESHOLD + 1
1872 -- characters are in the hash accumulator at one time.
1873
1874 SHIFTBITS : constant := ((HASHBITS + THRESHOLD_Rich) / (THRESHOLD_Rich + 1));
1875
1876 -- Sector size constants [the dictionary is partitoned in sectors].
1877
1878 SECTORBIT : constant := 13; -- [original: 10; OK: 13]
1879 SECTORLEN : constant := 2 ** SECTORBIT;
1880
1881 HASH_MASK_1 : constant := 16#8000#; -- [ was called HASHFLAG1 ]
1882 HASH_MASK_2 : constant := 16#7FFF#; -- [ was called HASHFLAG2 ]
1883
1884 -- Dictionary plus MAXMATCH extra chars for string comparisions.
1885 dict : array (Integer_M32'(0) .. DICTSIZE + MAXMATCH - 1) of Byte;
1886
1887 subtype Unsigned_int is Unsigned_16;
1888
1889 -- Hash table & link list tables.
1890
1891 -- [ So far we index the hash table with Integer (minimum 16 bit signed) ]
1892 hash : array (0 .. HASHSIZE - 1) of Unsigned_int := (others => NIL);
1893 -- [ nextlink: in lz77a.c: only through DICTSIZE - 1,
1894 -- although Init has: nextlink[DICTSIZE] = NIL. In doubt we set the
1895 -- 'Last to DICTSIZE and fill everything with NIL... ]
1896 nextlink : array (Integer_M32'(0) .. DICTSIZE) of Unsigned_int := (others => NIL);
1897 lastlink : array (Integer_M32'(0) .. DICTSIZE - 1) of Unsigned_int := (others => NIL);
1898
1899 -- Loads dictionary with characters from the input stream.
1900 --
1901 procedure Load_Dict (dictpos : Integer_M32; actually_read : out Integer_M32) is
1902 i : Integer_M32 := 0;
1903 begin
1904 while More_Bytes loop
1905 dict (dictpos + i) := Read_Byte;
1906 i := i + 1;
1907 exit when i = SECTORLEN;
1908 end loop;
1909
1910 -- Since the dictionary is a ring buffer, copy the characters at
1911 -- the very start of the dictionary to the end
1912 -- [this avoids having to use an "and" or a "mod" operator when searching].
1913 --
1914 if dictpos = 0 then
1915 for j in Integer_M32'(0) .. MAXMATCH - 1 loop
1916 dict (j + DICTSIZE) := dict (j);
1917 end loop;
1918 end if;
1919
1920 actually_read := i;
1921 end Load_Dict;
1922
1923 -- Deletes data from the dictionary search structures
1924 -- This is only done when the number of bytes to be
1925 -- compressed exceeds the dictionary's size.
1926 --
1927 procedure Delete_Data (dictpos : Integer_M32) is
1928 j, k : Integer_M32;
1929 begin
1930 -- Delete all references to the sector being deleted.
1931 k := dictpos + SECTORLEN;
1932 for i in dictpos .. k - 1 loop
1933 j := Integer_M32 (lastlink (i));
1934 if (Unsigned_int (j) and HASH_MASK_1) /= 0 then
1935 if j /= NIL then
1936 hash (Integer (Unsigned_int (j) and HASH_MASK_2)) := NIL;
1937 end if;
1938 else
1939 nextlink (j) := NIL;
1940 end if;
1941 end loop;
1942 end Delete_Data;
1943
1944 -- Hash data just entered into dictionary.
1945 -- XOR hashing is used here, but practically any hash function will work.
1946 --
1947 procedure Hash_Data (dictpos, bytestodo : Integer_M32) is
1948 j : Integer;
1949 k : Integer_M32;
1950 begin
1951 if bytestodo <= THRESHOLD_Rich then -- Not enough bytes in sector for match?
1952 nextlink (dictpos .. dictpos + bytestodo - 1) := (others => NIL);
1953 lastlink (dictpos .. dictpos + bytestodo - 1) := (others => NIL);
1954 else
1955 -- Matches can't cross sector boundaries.
1956 nextlink (dictpos + bytestodo - THRESHOLD_Rich .. dictpos + bytestodo - 1) := (others => NIL);
1957 lastlink (dictpos + bytestodo - THRESHOLD_Rich .. dictpos + bytestodo - 1) := (others => NIL);
1958
1959 j := Integer (
1960 Shift_Left (Unsigned_int (dict (dictpos)), SHIFTBITS)
1961 xor
1962 Unsigned_int (dict (dictpos + 1))
1963 );
1964
1965 k := dictpos + bytestodo - THRESHOLD_Rich; -- Calculate end of sector.
1966
1967 for i in dictpos .. k - 1 loop
1968 j := Integer (
1969 (Shift_Left (Unsigned_int (j), SHIFTBITS) and (HASHSIZE - 1))
1970 xor
1971 Unsigned_int (dict (i + THRESHOLD_Rich))
1972 );
1973 lastlink (i) := Unsigned_int (j) or HASH_MASK_1;
1974 nextlink (i) := hash (j);
1975 if nextlink (i) /= NIL then
1976 lastlink (Integer_M32 (nextlink (i))) := Unsigned_int (i);
1977 end if;
1978 hash (j) := Unsigned_int (i);
1979 end loop;
1980 end if;
1981 end Hash_Data;
1982
1983 matchlength, matchpos : Integer_M32;
1984
1985 -- Finds match for string at position dictpos.
1986 -- This search code finds the longest AND closest
1987 -- match for the string at dictpos.
1988 --
1989 procedure Find_Match (dictpos, startlen : Integer_M32) is
1990 i, j : Integer_M32;
1991 match_byte : Byte;
1992 begin
1993 i := dictpos;
1994 matchlength := startlen;
1995 match_byte := dict (dictpos + matchlength);
1996 --
1997 Chances :
1998 for compare_count in 1 .. MAXCOMPARES loop
1999 i := Integer_M32 (nextlink (i)); -- Get next string in list.
2000 if i = NIL then
2001 return;
2002 end if;
2003 --
2004 if dict (i + matchlength) = match_byte then -- Possible larger match?
2005 j := 0;
2006 -- Compare strings.
2007 loop
2008 exit when dict (dictpos + j) /= dict (i + j);
2009 j := j + 1;
2010 exit when j = MAXMATCH;
2011 end loop;
2012 --
2013 if j > matchlength then -- Found larger match?
2014 matchlength := j;
2015 matchpos := i;
2016 if matchlength = MAXMATCH then
2017 return; -- Exit if largest possible match.
2018 end if;
2019 match_byte := dict (dictpos + matchlength);
2020 end if;
2021 end if;
2022 end loop Chances; -- Keep on trying until we run out of chances.
2023 end Find_Match;
2024
2025 -- Finds dictionary matches for characters in current sector.
2026 --
2027 procedure Dict_Search (dictpos, bytestodo : Integer_M32) is
2028 i, j : Integer_M32;
2029 matchlen1, matchpos1 : Integer_M32;
2030 --
2031 procedure Write_literal_pos_i is
2032 pragma Inline (Write_literal_pos_i);
2033 begin
2034 Write_Literal (dict (i));
2035 i := i + 1;
2036 j := j - 1;
2037 end Write_literal_pos_i;
2038 begin
2039 i := dictpos;
2040 j := bytestodo;
2041
2042 if not GREEDY then -- Non-greedy search loop (slow).
2043
2044 while j /= 0 loop -- Loop while there are still characters left to be compressed.
2045 Find_Match (i, THRESHOLD_Rich);
2046
2047 if matchlength > THRESHOLD_Rich then
2048 matchlen1 := matchlength;
2049 matchpos1 := matchpos;
2050
2051 loop
2052 Find_Match (i + 1, matchlen1);
2053
2054 if matchlength > matchlen1 then
2055 matchlen1 := matchlength;
2056 matchpos1 := matchpos;
2057 Write_literal_pos_i;
2058 else
2059 if matchlen1 > j then
2060 matchlen1 := j;
2061 if matchlen1 <= THRESHOLD_Rich then
2062 Write_literal_pos_i;
2063 exit;
2064 end if;
2065 end if;
2066
2067 Write_DL_Code (
2068 length => Integer (matchlen1),
2069 -- [The subtraction happens modulo 2**n, needs to be cleaned modulo 2**DICTSIZE]
2070 distance => Integer ((Unsigned_32 (i) - Unsigned_32 (matchpos1)) and (DICTSIZE - 1))
2071 );
2072 i := i + matchlen1;
2073 j := j - matchlen1;
2074 exit;
2075 end if;
2076 end loop;
2077
2078 else
2079 Write_literal_pos_i;
2080 end if;
2081
2082 end loop; -- while j /= 0
2083
2084 else -- Greedy search loop (fast).
2085
2086 while j /= 0 loop -- Loop while there are still characters left to be compressed.
2087
2088 Find_Match (i, THRESHOLD_Rich);
2089
2090 if matchlength > j then
2091 matchlength := j; -- Clamp matchlength.
2092 end if;
2093
2094 if matchlength > THRESHOLD_Rich then -- Valid match?
2095 Write_DL_Code (
2096 length => Integer (matchlength),
2097 -- [The subtraction happens modulo 2**n, needs to be cleaned modulo 2**DICTSIZE]
2098 distance => Integer ((Unsigned_32 (i) - Unsigned_32 (matchpos)) and (DICTSIZE - 1))
2099 );
2100 i := i + matchlength;
2101 j := j - matchlength;
2102 else
2103 Write_literal_pos_i;
2104 end if;
2105 end loop;
2106
2107 end if; -- Greedy or not.
2108
2109 end Dict_Search;
2110
2111 procedure Encode_Rich is
2112 dictpos, actual_read : Integer_M32 := 0;
2113 deleteflag : Boolean := False;
2114 begin
2115 loop
2116 -- Delete old data from dictionary.
2117 if deleteflag then
2118 Delete_Data (dictpos);
2119 end if;
2120
2121 -- Grab more data to compress.
2122 Load_Dict (dictpos, actual_read);
2123 exit when actual_read = 0;
2124
2125 -- Hash the data.
2126 Hash_Data (dictpos, actual_read);
2127
2128 -- Find dictionary matches.
2129 Dict_Search (dictpos, actual_read);
2130
2131 dictpos := dictpos + SECTORLEN;
2132
2133 -- Wrap back to beginning of dictionary when it's full.
2134 if dictpos = DICTSIZE then
2135 dictpos := 0;
2136 deleteflag := True; -- Ok to delete now.
2137 end if;
2138 end loop;
2139 end Encode_Rich;
2140
2141 begin
2142 Encode_Rich;
2143 end LZ77_by_Rich;
2144
2145 -- The following is for research purposes: compare different LZ77
2146 -- algorithms applied to entropy encoders (Deflate, LZMA, ...).
2147
2148 procedure LZ77_from_Dump_File is
2149 LZ77_Dump : Ada.Text_IO.File_Type;
2150 tag : String (1 .. 3);
2151 Wrong_LZ77_tag : exception;
2152 a, b : Integer;
2153 dummy : Byte;
2154 use Ada.Integer_Text_IO;
2155 begin
2156 -- Pretend we compress the given stream.
2157 -- Entire stream is consumed here.
2158 while More_Bytes loop
2159 dummy := Read_Byte;
2160 end loop;
2161 -- Now send dumped LZ77 data further.
2162 Ada.Text_IO.Open (LZ77_Dump, Ada.Text_IO.In_File, "dump.lz77");
2163 -- File from UnZip.Decompress, or LZMA.Decoding, some_trace = True mode
2164 while not Ada.Text_IO.End_Of_File (LZ77_Dump) loop
2165 Ada.Text_IO.Get (LZ77_Dump, tag);
2166 if tag = "Lit" then
2167 Get (LZ77_Dump, a);
2168 Write_Literal (Byte (a));
2169 elsif tag = "DLE" then
2170 Get (LZ77_Dump, a);
2171 Get (LZ77_Dump, b);
2172 Write_DL_Code (a, b);
2173 else
2174 raise Wrong_LZ77_tag;
2175 end if;
2176 Ada.Text_IO.Skip_Line (LZ77_Dump);
2177 end loop;
2178 Ada.Text_IO.Close (LZ77_Dump);
2179 end LZ77_from_Dump_File;
2180
2181 begin
2182 case Method is
2183 when LZHuf =>
2184 LZ77_using_LZHuf;
2185 when IZ_4 .. IZ_10 =>
2186 LZ77_using_IZ (4 + Method_Type'Pos (Method) - Method_Type'Pos (IZ_4));
2187 when BT4 =>
2188 LZ77_using_BT4;
2189 when Rich =>
2190 LZ77_by_Rich;
2191 when No_LZ77 =>
2192 while More_Bytes loop
2193 Write_Literal (Read_Byte);
2194 end loop;
2195 when Read_LZ77_Codes =>
2196 LZ77_from_Dump_File;
2197 end case;
2198 end Encode;
2199
2200 end LZ77;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.