Source file : zip-compress-deflate.adb
1 -- Legal licensing note:
2
3 -- Copyright (c) 2009 .. 2024 Gautier de Montmollin
4 -- SWITZERLAND
5
6 -- Permission is hereby granted, free of charge, to any person obtaining a copy
7 -- of this software and associated documentation files (the "Software"), to deal
8 -- in the Software without restriction, including without limitation the rights
9 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 -- copies of the Software, and to permit persons to whom the Software is
11 -- furnished to do so, subject to the following conditions:
12
13 -- The above copyright notice and this permission notice shall be included in
14 -- all copies or substantial portions of the Software.
15
16 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 -- THE SOFTWARE.
23
24 -- NB: this is the MIT License, as found on the site
25 -- http://www.opensource.org/licenses/mit-license.php
26
27 -----------------
28 -- The "Deflate" method combines a LZ77 compression
29 -- method with some Huffman encoding gymnastics.
30 --
31 -- Magic numbers in this procedure are adjusted through experimentation and marked with: *Tuned*
32 --
33 -- To do:
34 -- - Taillaule: try with slider and/or initial lz window not centered
35 -- - Taillaule: compare slider to random and fixed in addition to initial
36 -- - Taillaule: try L_sup distance
37 -- - Taillaule: restrict BL_Vector to short LZ distances (long distances perhaps too random)
38 -- - Taillaule: check LZ vector norms on literals only, too (consider distances & lengths as noise)
39 -- - Taillaule: use a corpus of files badly compressed by our Deflate comparatively
40 -- to other Deflates (e.g. 7Z seems better with databases)
41 -- - Add DeflOpt to slowest method, or approximate it by tweaking
42 -- distance and length statistics before computing their Huffman codes, or
43 -- reinvent it by computing the size of emitted codes and trying slight changes
44 -- to the codes' bit lengths.
45 -- - Improve LZ77 compression: see Zip.LZ77 to-do list; check with bypass_LZ77 below
46 -- and various programs based on LZ77 using the trace >= some and the LZ77 dump
47 -- in UnZip.Decompress.
48 -- - Make this procedure standalone & generic like LZMA.Encoding;
49 -- use it in the Zada project (Zlib replacement)
50 --
51 -- Change log:
52 --------------
53 --
54 -- 16-Mar-2016: Taillaule algorithm: first version ready for release.
55 -- 20-Feb-2016: (rev.305) Start of smarter techniques for "Dynamic" encoding: Taillaule algorithm
56 -- 4-Feb-2016: Start of "Dynamic" encoding format (compression structure sent before block)
57 --
58 -- 19-Feb-2011: All distance and length codes implemented.
59 -- 18-Feb-2011: First version working with Deflate fixed and restricted distance & length codes.
60 -- 17-Feb-2011: Created (single-block, "fixed" Huffman encoding).
61
62 with Huffman.Encoding.Length_Limited_Coding;
63 with LZ77;
64
65 with Ada.Text_IO,
66 Ada.Unchecked_Deallocation;
67
68 procedure Zip.Compress.Deflate
69 (input,
70 output : in out Zip_Streams.Root_Zipstream_Type'Class;
71 input_size_known : Boolean;
72 input_size : Zip_64_Data_Size_Type; -- ignored if unknown
73 feedback : Feedback_Proc;
74 method : Deflation_Method;
75 CRC : in out Interfaces.Unsigned_32; -- only updated here
76 crypto : in out CRC_Crypto.Crypto_pack;
77 output_size : out Zip_64_Data_Size_Type;
78 compression_ok : out Boolean) -- indicates compressed < uncompressed
79 is
80 -- Options for testing.
81 -- All should be on False for normal use of this procedure.
82
83 deactivate_scanning : constant Boolean := False; -- Impact analysis of the scanning method
84 trace : constant Boolean := False; -- Log file with details
85 trace_descriptors : constant Boolean := False; -- Additional logging of Huffman descriptors
86
87 -- A log file is used when trace = True.
88 log : Ada.Text_IO.File_Type;
89 log_name : constant String := "Zip.Compress.Deflate.zcd"; -- A CSV with an unusual extension
90 sep : constant Character := ';';
91
92 use Ada.Text_IO;
93 use Interfaces;
94
95 -------------------------------------
96 -- Buffered I/O - byte granularity --
97 -------------------------------------
98
99 IO_buffers : IO_Buffers_Type;
100
101 procedure Put_byte (B : Byte) is -- Put a byte, at the byte granularity level
102 pragma Inline (Put_byte);
103 begin
104 IO_buffers.OutBuf (IO_buffers.OutBufIdx) := B;
105 IO_buffers.OutBufIdx := IO_buffers.OutBufIdx + 1;
106 if IO_buffers.OutBufIdx > IO_buffers.OutBuf'Last then
107 Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
108 end if;
109 end Put_byte;
110
111 procedure Flush_byte_buffer is
112 begin
113 if IO_buffers.OutBufIdx > 1 then
114 Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
115 end if;
116 end Flush_byte_buffer;
117
118 ------------------------------------------------------
119 -- Bit code buffer, for sending data at bit level --
120 ------------------------------------------------------
121
122 -- Output buffer. Bits are inserted starting at the right (least
123 -- significant bits). The width of bit_buffer must be at least 16 bits.
124 subtype U32 is Unsigned_32;
125 bit_buffer : U32 := 0;
126 -- Number of valid bits in bit_buffer. All bits above the last valid bit are always zero.
127 valid_bits : Integer := 0;
128
129 procedure Flush_bit_buffer is
130 begin
131 while valid_bits > 0 loop
132 Put_byte (Byte (bit_buffer and 16#FF#));
133 bit_buffer := Shift_Right (bit_buffer, 8);
134 valid_bits := Integer'Max (0, valid_bits - 8);
135 end loop;
136 bit_buffer := 0;
137 end Flush_bit_buffer;
138
139 -- Bit codes are at most 15 bits for Huffman codes,
140 -- or 13 for explicit codes (distance extra bits).
141 subtype Code_Size_Type is Integer range 1 .. 15;
142
143 -- Send a value on a given number of bits.
144 procedure Put_Bits (code : U32; code_size : Code_Size_Type) with Inline is
145 begin
146 -- Put bits from code at the left of existing ones. They might be shifted away
147 -- partially on the left side (or even entirely if valid_bits is already = 32).
148 bit_buffer := bit_buffer or Shift_Left (code, valid_bits);
149 valid_bits := valid_bits + code_size;
150 if valid_bits > 32 then
151 -- Flush 32 bits to output as 4 bytes
152 Put_byte (Byte (bit_buffer and 16#FF#));
153 Put_byte (Byte (Shift_Right (bit_buffer, 8) and 16#FF#));
154 Put_byte (Byte (Shift_Right (bit_buffer, 16) and 16#FF#));
155 Put_byte (Byte (Shift_Right (bit_buffer, 24) and 16#FF#));
156 valid_bits := valid_bits - 32;
157 -- Empty buffer and put on it the rest of the code
158 bit_buffer := Shift_Right (code, code_size - valid_bits);
159 end if;
160 end Put_Bits;
161
162 ------------------------------------------------------
163 -- Deflate, post LZ encoding, with Huffman encoding --
164 ------------------------------------------------------
165
166 -- The Huffman code set (and therefore the Huffman tree) is completely determined by
167 -- the bit length to be used for reaching leaf nodes, thanks to two special
168 -- rules (explanation in RFC 1951, section 3.2.2).
169 --
170 -- So basically the process is the following:
171 --
172 -- (A) Gather statistics (just counts) for the alphabet.
173 -- (B) Turn these counts into code lengths, by calling Length_limited_Huffman_code_lengths.
174 -- (C) Build Huffman codes (the bits to be sent) with a call to Prepare_Huffman_codes.
175 --
176 -- In short:
177 --
178 -- data -> (A) -> stats -> (B) -> Huffman codes' bit lengths -> (C) -> Huffman codes
179
180 type Bit_Length_Array is array (Natural range <>) of Natural;
181
182 subtype Alphabet_lit_len is Natural range 0 .. 287;
183 subtype Bit_length_array_lit_len is Bit_Length_Array (Alphabet_lit_len);
184 subtype Alphabet_dis is Natural range 0 .. 31;
185 subtype Bit_length_array_dis is Bit_Length_Array (Alphabet_dis);
186
187 type Deflate_Huff_Descriptors is record
188 -- Tree descriptor for Literal, EOB or Length encoding
189 lit_len : Huffman.Encoding.Descriptor (0 .. 287);
190 -- Tree descriptor for Distance encoding
191 dis : Huffman.Encoding.Descriptor (0 .. 31);
192 end record;
193 -- NB: Appnote: "Literal codes 286-287 and distance codes 30-31 are never used
194 -- but participate in the Huffman construction."
195 -- Setting upper bound to 285 for literals leads to invalid codes, sometimes.
196
197 -- Copy bit length vectors into Deflate Huffman descriptors
198
199 function Build_descriptors (
200 bl_for_lit_len : Bit_length_array_lit_len;
201 bl_for_dis : Bit_length_array_dis
202 )
203 return Deflate_Huff_Descriptors
204 is
205 new_d : Deflate_Huff_Descriptors;
206 begin
207 for i in new_d.lit_len'Range loop
208 new_d.lit_len (i) := (bit_length => bl_for_lit_len (i), code => Huffman.Encoding.invalid);
209 if trace_descriptors and then trace and then Is_Open (log) then
210 Put (log, Integer'Image (bl_for_lit_len (i)) & sep);
211 end if;
212 end loop;
213 for i in new_d.dis'Range loop
214 new_d.dis (i) := (bit_length => bl_for_dis (i), code => Huffman.Encoding.invalid);
215 if trace_descriptors and then trace and then Is_Open (log) then
216 Put (log, Integer'Image (bl_for_dis (i)) & sep);
217 end if;
218 end loop;
219 if trace_descriptors and then trace and then Is_Open (log) then
220 New_Line (log);
221 end if;
222 return new_d;
223 end Build_descriptors;
224
225 type Count_type is range 0 .. Zip_64_Data_Size_Type'Last / 2 - 1;
226 type Stats_type is array (Natural range <>) of Count_type;
227
228 -- The following is a translation of Zopfli's OptimizeHuffmanForRle (v. 11-May-2016).
229 -- Possible gain: shorten the compression header containing the Huffman trees' bit lengths.
230 -- Possible loss: since the stats do not correspond anymore exactly to the data
231 -- to be compressed, the Huffman trees might be suboptimal.
232 --
233 -- Zopfli comment:
234 -- Changes the population counts in a way that the consequent Huffman tree
235 -- compression, especially its rle-part, will be more likely to compress this data
236 -- more efficiently.
237 --
238 procedure Tweak_for_better_RLE (counts : in out Stats_type) is
239 length : Integer := counts'Length;
240 stride : Integer;
241 symbol, sum, limit, new_count : Count_type;
242 good_for_rle : array (counts'Range) of Boolean := (others => False);
243 begin
244 -- 1) We don't want to touch the trailing zeros. We may break the
245 -- rules of the format by adding more data in the distance codes.
246 loop
247 if length = 0 then
248 return;
249 end if;
250 exit when counts (length - 1) /= 0;
251 length := length - 1;
252 end loop;
253 -- Now counts(0..length - 1) does not have trailing zeros.
254 --
255 -- 2) Let's mark all population counts that already can be encoded with an rle code.
256 --
257 -- Let's not spoil any of the existing good rle codes.
258 -- Mark any seq of 0's that is longer than 5 as a good_for_rle.
259 -- Mark any seq of non-0's that is longer than 7 as a good_for_rle.
260 symbol := counts (0);
261 stride := 0;
262 for i in 0 .. length loop
263 if i = length or else counts (i) /= symbol then
264 if (symbol = 0 and then stride >= 5) or else (symbol /= 0 and then stride >= 7) then
265 for k in 0 .. stride - 1 loop
266 good_for_rle (i - k - 1) := True;
267 end loop;
268 end if;
269 stride := 1;
270 if i /= length then
271 symbol := counts (i);
272 end if;
273 else
274 stride := stride + 1;
275 end if;
276 end loop;
277 -- 3) Let's replace those population counts that lead to more rle codes.
278 stride := 0;
279 limit := counts (0);
280 sum := 0;
281 for i in 0 .. length loop
282 if i = length or else good_for_rle (i)
283 or else (i > 0 and then good_for_rle (i - 1)) -- Added from Brotli, item #1
284 -- Heuristic for selecting the stride ranges to collapse.
285 or else abs (counts (i) - limit) >= 4
286 then
287 if stride >= 4 or else (stride >= 3 and then sum = 0) then
288 -- The stride must end, collapse what we have, if we have enough (4).
289 -- GdM: new_count is the average of counts on the stride's interval, upper-rounded.
290 new_count := Count_type'Max (1, (sum + Count_type (stride) / 2) / Count_type (stride));
291 if sum = 0 then
292 -- Don't make an all zeros stride to be upgraded to ones.
293 new_count := 0;
294 end if;
295 for k in 0 .. stride - 1 loop
296 -- We don't want to change value at counts(i),
297 -- that is already belonging to the next stride. Thus - 1.
298 counts (i - k - 1) := new_count; -- GdM: Replace histogram value by averaged value.
299 end loop;
300 end if;
301 stride := 0;
302 sum := 0;
303 if i < length - 3 then
304 -- All interesting strides have a count of at least 4, at least when non-zeros.
305 -- GdM: limit is the average of next 4 counts, upper-rounded.
306 limit := (counts (i) + counts (i + 1) + counts (i + 2) + counts (i + 3) + 2) / 4;
307 elsif i < length then
308 limit := counts (i);
309 else
310 limit := 0;
311 end if;
312 end if;
313 stride := stride + 1;
314 if i /= length then
315 sum := sum + counts (i);
316 end if;
317 end loop;
318 end Tweak_for_better_RLE;
319
320 subtype Stats_lit_len_type is Stats_type (Alphabet_lit_len);
321 subtype Stats_dis_type is Stats_type (Alphabet_dis);
322
323 -- Phase (B) : we turn statistics into Huffman bit lengths.
324 function Build_descriptors (
325 stats_lit_len : Stats_lit_len_type;
326 stats_dis : Stats_dis_type
327 )
328 return Deflate_Huff_Descriptors
329 is
330 bl_for_lit_len : Bit_length_array_lit_len;
331 bl_for_dis : Bit_length_array_dis;
332 procedure LLHCL_lit_len is new
333 Huffman.Encoding.Length_Limited_Coding
334 (Alphabet_lit_len, Count_type, Stats_lit_len_type, Bit_length_array_lit_len, 15);
335 procedure LLHCL_dis is new
336 Huffman.Encoding.Length_Limited_Coding
337 (Alphabet_dis, Count_type, Stats_dis_type, Bit_length_array_dis, 15);
338 stats_dis_copy : Stats_dis_type := stats_dis;
339 --
340 procedure Patch_statistics_for_buggy_decoders is
341 -- Workaround for buggy Info-Zip decoder versions.
342 -- See "PatchDistanceCodesForBuggyDecoders" in Zopfli's deflate.c
343 -- NB: here, we patch the statistics and not the resulting bit lengths,
344 -- to be sure we avoid invalid Huffman code sets in the end.
345 -- The decoding bug concerns Zlib v.<= 1.2.1, UnZip v.<= 6.0, WinZip v.<=10.0.
346 used : Natural := 0;
347 begin
348 for i in stats_dis_copy'Range loop
349 if stats_dis_copy (i) /= 0 then
350 used := used + 1;
351 end if;
352 end loop;
353 case used is
354 when 0 => -- No distance code used at all (data must be almost random).
355 stats_dis_copy (0 .. 1) := (1, 1);
356 when 1 =>
357 if stats_dis_copy (0) = 0 then
358 stats_dis_copy (0) := 1; -- Now, code 0 and some other code have non-zero counts.
359 else
360 stats_dis_copy (1) := 1; -- Now, codes 0 and 1 have non-zero counts.
361 end if;
362 when others =>
363 null; -- No workaround needed when 2 or more distance codes are defined.
364 end case;
365 end Patch_statistics_for_buggy_decoders;
366 begin
367 Patch_statistics_for_buggy_decoders;
368 LLHCL_lit_len (stats_lit_len, bl_for_lit_len); -- Call the magic algorithm for setting
369 LLHCL_dis (stats_dis_copy, bl_for_dis); -- up Huffman lengths of both trees
370 return Build_descriptors (bl_for_lit_len, bl_for_dis);
371 end Build_descriptors;
372
373 -- Here is one original part in the Taillaule algorithm: use of basic
374 -- topology (L1, L2 distances) to check similarities between Huffman code sets.
375
376 -- Bit length vector. Convention: 16 is unused bit length (close to the bit length for the
377 -- rarest symbols, 15, and far from the bit length for the most frequent symbols, 1).
378 -- Deflate uses 0 for unused.
379 subtype BL_code is Integer_M32 range 1 .. 16;
380 type BL_vector is array (1 .. 288 + 32) of BL_code;
381
382 function Convert (h : Deflate_Huff_Descriptors) return BL_vector is
383 bv : BL_vector;
384 j : Positive := 1;
385 begin
386 for i in h.lit_len'Range loop
387 if h.lit_len (i).bit_length = 0 then
388 bv (j) := 16;
389 else
390 bv (j) := Integer_M32 (h.lit_len (i).bit_length);
391 end if;
392 j := j + 1;
393 end loop;
394 for i in h.dis'Range loop
395 if h.dis (i).bit_length = 0 then
396 bv (j) := 16;
397 else
398 bv (j) := Integer_M32 (h.dis (i).bit_length);
399 end if;
400 j := j + 1;
401 end loop;
402 return bv;
403 end Convert;
404
405 -- L1 or Manhattan distance
406 function L1_distance (b1, b2 : BL_vector) return Natural_M32 is
407 s : Natural_M32 := 0;
408 begin
409 for i in b1'Range loop
410 s := s + abs (b1 (i) - b2 (i));
411 end loop;
412 return s;
413 end L1_distance;
414
415 -- L1, tweaked
416 --
417 tweak : constant array (BL_code) of Positive_M32 :=
418 -- For the origin of the tweak function, see "za_work.xls", sheet "Deflate".
419 -- function f3 = 0.20 f1 [logarithmic] + 0.80 * identity
420 -- NB: all values are multiplied by 100 for accuracy.
421 (100, 255, 379, 490, 594, 694, 791, 885, 978, 1069, 1159, 1249, 1338, 1426, 1513, 1600);
422 -- Neutral is:
423 -- (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600)
424
425 --
426 function L1_tweaked (b1, b2 : BL_vector) return Natural_M32 is
427 s : Natural_M32 := 0;
428 begin
429 for i in b1'Range loop
430 s := s + abs (tweak (b1 (i)) - tweak (b2 (i)));
431 end loop;
432 return s;
433 end L1_tweaked;
434
435 -- L2 or Euclidean distance
436 function L2_distance_square (b1, b2 : BL_vector) return Natural_M32 is
437 s : Natural_M32 := 0;
438 begin
439 for i in b1'Range loop
440 s := s + (b1 (i) - b2 (i)) ** 2;
441 end loop;
442 return s;
443 end L2_distance_square;
444
445 -- L2, tweaked
446 function L2_tweaked_square (b1, b2 : BL_vector) return Natural_M32 is
447 s : Natural_M32 := 0;
448 begin
449 for i in b1'Range loop
450 s := s + (tweak (b1 (i)) - tweak (b2 (i))) ** 2;
451 end loop;
452 return s;
453 end L2_tweaked_square;
454
455 type Distance_type is (L1, L1_tweaked, L2, L2_tweaked);
456
457 function Similar
458 (h1, h2 : Deflate_Huff_Descriptors;
459 dist_kind : Distance_type;
460 threshold : Natural;
461 comment : String)
462 return Boolean
463 is
464 dist : Natural_M32;
465 thres : Natural_M32 := Natural_M32 (threshold);
466 begin
467 case dist_kind is
468 when L1 =>
469 dist := L1_distance (Convert (h1), Convert (h2));
470 when L1_tweaked =>
471 thres := thres * tweak (1);
472 dist := L1_tweaked (Convert (h1), Convert (h2));
473 when L2 =>
474 thres := thres * thres;
475 dist := L2_distance_square (Convert (h1), Convert (h2));
476 when L2_tweaked =>
477 thres := (thres * thres) * (tweak (1) * tweak (1));
478 dist := L2_tweaked_square (Convert (h1), Convert (h2));
479 end case;
480 if trace then
481 Put_Line (log,
482 "Checking similarity." & sep &
483 Distance_type'Image (dist_kind) & sep &
484 "Distance (ev. x100, **2):" & sep & Integer_M32'Image (dist) & sep & sep &
485 "Threshold (ev. x100, **2):" & sep & Integer_M32'Image (thres) & sep & sep &
486 comment
487 );
488 end if;
489 return dist < thres;
490 end Similar;
491
492 -- Another original part in the Taillaule algorithm: the possibility of recycling
493 -- Huffman codes. It is possible only if previous block was not stored and if
494 -- the new block's used alphabets are included in the old block's used alphabets.
495 function Recyclable (h_old, h_new : Deflate_Huff_Descriptors) return Boolean is
496 begin
497 for i in h_old.lit_len'Range loop
498 if h_old.lit_len (i).bit_length = 0 and h_new.lit_len (i).bit_length > 0 then
499 return False; -- Code used in new, but not in old
500 end if;
501 end loop;
502 for i in h_old.dis'Range loop
503 if h_old.dis (i).bit_length = 0 and h_new.dis (i).bit_length > 0 then
504 return False; -- Code used in new, but not in old
505 end if;
506 end loop;
507 return True;
508 end Recyclable;
509
510 -- Phase (C): the Prepare_Huffman_Codes procedure finds the Huffman code
511 -- for each value, given the bit length imposed as input.
512
513 function Prepare_Huffman_Codes (dhd : Deflate_Huff_Descriptors) return Deflate_Huff_Descriptors
514 is
515 dhd_var : Deflate_Huff_Descriptors := dhd;
516 begin
517 Huffman.Encoding.Prepare_Codes (dhd_var.lit_len, Code_Size_Type'Last, True);
518 Huffman.Encoding.Prepare_Codes (dhd_var.dis, Code_Size_Type'Last, True);
519 return dhd_var;
520 end Prepare_Huffman_Codes;
521
522 -- Emit a variable length Huffman code
523 procedure Put_Huffman_Code (lc : Huffman.Encoding.Length_Code_Pair) is
524 pragma Inline (Put_Huffman_Code);
525 begin
526 -- Huffman code of length 0 should never occur: when constructing
527 -- the code lengths (LLHCL) any single occurrence in the statistics
528 -- will trigger the build of a code length of 1 or more.
529 Put_Bits
530 (code => U32 (lc.code),
531 code_size => Code_Size_Type (lc.bit_length)); -- Range check for length 0 (if enabled).
532 end Put_Huffman_Code;
533
534 -- This is where the "dynamic" Huffman trees are sent before the block's data are sent.
535 --
536 -- The decoder needs to know in advance the pair of trees (1st tree for literals-eob-LZ
537 -- lengths, 2nd tree for LZ distances) for decoding the compressed data.
538 -- But this information takes some room. Fortunately Deflate allows for compressing it
539 -- with a combination of Huffman and Run-Length Encoding (RLE) to make this header smaller.
540 -- Concretely, the trees are described by the bit length of each symbol, so the header's
541 -- content is a vector of length max 320, whose contents are in the 0 .. 18 range and typically
542 -- look like: ... 8, 8, 9, 7, 8, 10, 6, 8, 8, 8, 8, 8, 11, 8, 9, 8, ...
543 -- Clearly this vector has redundancies and can be sent in a compressed form.
544 -- In this example, the RLE will compress the string of 8's with a single code 8, then a code 17
545 -- (repeat x times). Anyway, the very frequent 8's will be encoded with a small number of
546 -- bits (less than the 5 plain bits, or maximum 7 Huffman-encoded bits
547 -- needed for encoding integers in the 0 .. 18 range).
548 --
549 procedure Put_Compression_Structure
550 (dhd : Deflate_Huff_Descriptors;
551 cost_analysis : Boolean; -- If True: just simulate the whole, and count needed bits
552 bits : in out Count_type) -- This is incremented when cost_analysis = True
553 is
554 subtype Alphabet is Integer range 0 .. 18;
555 type Alpha_Array is new Bit_Length_Array (Alphabet);
556 truc_freq, truc_bl : Alpha_Array;
557 truc : Huffman.Encoding.Descriptor (Alphabet);
558 -- Compression structure: cs_bl is the "big" array with all bit lengths
559 -- for compressing data. cs_bl will be sent compressed, too.
560 cs_bl : array (1 .. dhd.lit_len'Length + dhd.dis'Length) of Natural;
561 last_cs_bl : Natural;
562 max_used_lln_code : Alphabet_lit_len := 0;
563 max_used_dis_code : Alphabet_dis := 0;
564 --
565 procedure Concatenate_all_bit_lengths is
566 idx : Natural := 0;
567 begin
568 for a in reverse Alphabet_lit_len loop
569 if dhd.lit_len (a).bit_length > 0 then
570 max_used_lln_code := a;
571 exit;
572 end if;
573 end loop;
574 for a in reverse Alphabet_dis loop
575 if dhd.dis (a).bit_length > 0 then
576 max_used_dis_code := a;
577 exit;
578 end if;
579 end loop;
580 -- Copy bit lengths for both trees into one array, cs_bl.
581 for a in 0 .. max_used_lln_code loop
582 idx := idx + 1;
583 cs_bl (idx) := dhd.lit_len (a).bit_length;
584 end loop;
585 for a in 0 .. max_used_dis_code loop
586 idx := idx + 1;
587 cs_bl (idx) := dhd.dis (a).bit_length;
588 end loop;
589 last_cs_bl := idx;
590 end Concatenate_all_bit_lengths;
591 --
592 extra_bits_needed : constant array (Alphabet) of Natural :=
593 (16 => 2, 17 => 3, 18 => 7, others => 0);
594 --
595 type Emission_mode is (simulate, effective);
596 --
597 procedure Emit_data_compression_structures (emit_mode : Emission_mode) is
598 procedure Emit_data_compression_atom (x : Alphabet; extra_code : U32 := 0) is
599 -- x is a bit length (value in 0..15), or a RLE instruction
600 begin
601 case emit_mode is
602 when simulate =>
603 truc_freq (x) := truc_freq (x) + 1; -- +1 for x's histogram bar
604 when effective =>
605 Put_Huffman_Code (truc (x));
606 declare
607 extra_bits : constant Natural := extra_bits_needed (x);
608 begin
609 if extra_bits > 0 then
610 Put_Bits (extra_code, extra_bits);
611 end if;
612 end;
613 end case;
614 end Emit_data_compression_atom;
615 idx : Natural := 0;
616 rep : Positive; -- Number of times current atom is repeated, >= 1
617 begin
618 -- Emit the bit lengths, with some RLE encoding (Appnote: 5.5.3; RFC 1951: 3.2.7)
619 idx := 1;
620 loop
621 rep := 1; -- Current atom, cs_bl(idx), is repeated 1x so far - obvious, isn't it ?
622 for j in idx + 1 .. last_cs_bl loop
623 exit when cs_bl (j) /= cs_bl (idx);
624 rep := rep + 1;
625 end loop;
626 -- Now rep is the number of repetitions of current atom, including itself.
627 if idx > 1 and then cs_bl (idx) = cs_bl (idx - 1) and then rep >= 3
628 -- Better repeat a long sequence of zeros by using codes 17 or 18
629 -- just after a 138-long previous sequence:
630 and then not (cs_bl (idx) = 0 and then rep > 6)
631 then
632 rep := Integer'Min (rep, 6);
633 Emit_data_compression_atom (16, U32 (rep - 3)); -- 16: "Repeat previous 3 to 6 times"
634 idx := idx + rep;
635 elsif cs_bl (idx) = 0 and then rep >= 3 then
636 -- The 0 bit length may occur on long ranges of an alphabet (unused symbols)
637 if rep <= 10 then
638 Emit_data_compression_atom (17, U32 (rep - 3)); -- 17: "Repeat zero 3 to 10 times"
639 else
640 rep := Integer'Min (rep, 138);
641 Emit_data_compression_atom (18, U32 (rep - 11)); -- 18: "Repeat zero 11 to 138 times"
642 end if;
643 idx := idx + rep;
644 else
645 Emit_data_compression_atom (cs_bl (idx));
646 idx := idx + 1;
647 end if;
648 exit when idx > last_cs_bl;
649 end loop;
650 end Emit_data_compression_structures;
651 -- Alphabet permutation for shortening in-use alphabet.
652 -- After the RLE codes 16, 17, 18 and the bit length 0, which are assumed to be always used,
653 -- the most usual bit lengths (around 8, which is the "neutral" bit length) appear first.
654 -- For example, if the rare bit lengths 1 and 15 don't occur in any of the two Huffman trees
655 -- for LZ data, then codes 1 and 15 have a length 0 in the local Alphabet and we can omit
656 -- sending the last two bit lengths.
657 alphabet_permutation : constant array (Alphabet) of Natural :=
658 (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
659 procedure LLHCL is new
660 Huffman.Encoding.Length_Limited_Coding (Alphabet, Natural, Alpha_Array, Alpha_Array, 7);
661 a_non_zero : Alphabet;
662 begin
663 Concatenate_all_bit_lengths;
664 truc_freq := (others => 0);
665 Emit_data_compression_structures (simulate);
666 -- We have now statistics of all bit lengths occurrences of both Huffman
667 -- trees used for compressing the data.
668 -- We turn these counts into bit lengths for the local tree
669 -- that helps us to store the compression structure in a more compact form.
670 LLHCL (truc_freq, truc_bl); -- Call the magic algorithm for setting up Huffman lengths
671 -- At least lengths for codes 16, 17, 18, 0 will always be sent,
672 -- even if all other bit lengths are 0 because codes 1 to 15 are unused.
673 a_non_zero := 3;
674 for a in Alphabet loop
675 if a > a_non_zero and then truc_bl (alphabet_permutation (a)) > 0 then
676 a_non_zero := a;
677 end if;
678 end loop;
679 if cost_analysis then
680 -- In this mode, no data output: we sum up the exact
681 -- number of bits needed by the compression header.
682 bits := bits + 14 + Count_type (1 + a_non_zero) * 3;
683 for a in Alphabet loop
684 bits := bits + Count_type (truc_freq (a) * (truc_bl (a) + extra_bits_needed (a)));
685 end loop;
686 else
687 -- We output the compression header to the output stream.
688 for a in Alphabet loop
689 truc (a).bit_length := truc_bl (a);
690 end loop;
691 Huffman.Encoding.Prepare_Codes (truc, Code_Size_Type'Last, True);
692 -- Output of the compression structure
693 Put_Bits (U32 (max_used_lln_code - 256), 5); -- max_used_lln_code is always >= 256 = EOB code
694 Put_Bits (U32 (max_used_dis_code), 5);
695 Put_Bits (U32 (a_non_zero - 3), 4);
696 -- Save the local alphabet's Huffman lengths. It's the compression structure
697 -- for compressing the data compression structure. Easy, isn't it ?
698 for a in 0 .. a_non_zero loop
699 Put_Bits (U32 (truc (alphabet_permutation (a)).bit_length), 3);
700 end loop;
701 -- Emit the Huffman lengths for encoding the data, in the local Huffman-encoded fashion.
702 Emit_data_compression_structures (effective);
703 end if;
704 end Put_Compression_Structure;
705
706 End_Of_Block : constant := 256;
707
708 -- Default Huffman trees, for "fixed" blocks, as defined in appnote.txt or RFC 1951
709 default_lit_len_bl : constant Bit_length_array_lit_len :=
710 (0 .. 143 => 8, -- For literals ("plain text" bytes)
711 144 .. 255 => 9, -- For more literals ("plain text" bytes)
712 End_Of_Block => 7, -- For EOB (256)
713 257 .. 279 => 7, -- For length codes
714 280 .. 287 => 8 -- For more length codes
715 );
716 default_dis_bl : constant Bit_length_array_dis := (others => 5);
717
718 Deflate_fixed_descriptors : constant Deflate_Huff_Descriptors :=
719 Prepare_Huffman_Codes (Build_descriptors (default_lit_len_bl, default_dis_bl));
720
721 -- Current tree descriptors
722 curr_descr : Deflate_Huff_Descriptors := Deflate_fixed_descriptors;
723
724 -- Write a normal, "clear-text" (post LZ, pre Huffman), 8-bit character (literal)
725 procedure Put_literal_byte (b : Byte) is
726 begin
727 Put_Huffman_Code (curr_descr.lit_len (Integer (b)));
728 end Put_literal_byte;
729
730 -- Possible ranges for distance and length encoding in the Zip-Deflate format:
731 subtype Length_range is Integer range 3 .. 258;
732 subtype Distance_range is Integer range 1 .. 32768;
733
734 -- This is where LZ distance-length tokens are written to the output stream.
735 -- The Deflate format defines a sort of logarithmic compression, with codes
736 -- for various distance and length ranges, plus extra bits for specifying the
737 -- exact values. The codes are sent as Huffman codes with variable bit lengths
738 -- (nothing to do with the lengths of LZ distance-length tokens).
739
740 -- Length Codes
741 -- ------------
742 -- Extra Extra Extra Extra
743 -- Code Bits Length Code Bits Lengths Code Bits Lengths Code Bits Length(s)
744 -- ---- ---- ------ ---- ---- ------- ---- ---- ------- ---- ---- ---------
745 -- 257 0 3 265 1 11,12 273 3 35-42 281 5 131-162
746 -- 258 0 4 266 1 13,14 274 3 43-50 282 5 163-194
747 -- 259 0 5 267 1 15,16 275 3 51-58 283 5 195-226
748 -- 260 0 6 268 1 17,18 276 3 59-66 284 5 227-257
749 -- 261 0 7 269 2 19-22 277 4 67-82 285 0 258
750 -- 262 0 8 270 2 23-26 278 4 83-98
751 -- 263 0 9 271 2 27-30 279 4 99-114
752 -- 264 0 10 272 2 31-34 280 4 115-130
753 --
754 -- Example: the code # 266 means the LZ length (# of message bytes to be copied)
755 -- shall be 13 or 14, depending on the extra bit value.
756
757 deflate_code_for_lz_length : constant array (Length_range) of Natural :=
758 (3 => 257, -- Codes 257..264, with no extra bit
759 4 => 258,
760 5 => 259,
761 6 => 260,
762 7 => 261,
763 8 => 262,
764 9 => 263,
765 10 => 264,
766 11 .. 12 => 265, -- Codes 265..268, with 1 extra bit
767 13 .. 14 => 266,
768 15 .. 16 => 267,
769 17 .. 18 => 268,
770 19 .. 22 => 269, -- Codes 269..272, with 2 extra bits
771 23 .. 26 => 270,
772 27 .. 30 => 271,
773 31 .. 34 => 272,
774 35 .. 42 => 273, -- Codes 273..276, with 3 extra bits
775 43 .. 50 => 274,
776 51 .. 58 => 275,
777 59 .. 66 => 276,
778 67 .. 82 => 277, -- Codes 277..280, with 4 extra bits
779 83 .. 98 => 278,
780 99 .. 114 => 279,
781 115 .. 130 => 280,
782 131 .. 162 => 281, -- Codes 281..284, with 5 extra bits
783 163 .. 194 => 282,
784 195 .. 226 => 283,
785 227 .. 257 => 284,
786 258 => 285 -- Code 285, with no extra bit
787 );
788
789 extra_bits_for_lz_length_offset : constant array (Length_range) of Integer :=
790 (3 .. 10 | 258 => Huffman.Encoding.invalid, -- just a placeholder, there is no extra bit there!
791 11 .. 18 => 11,
792 19 .. 34 => 19,
793 35 .. 66 => 35,
794 67 .. 130 => 67,
795 131 .. 257 => 131);
796
797 extra_bits_for_lz_length : constant array (Length_range) of Natural :=
798 (3 .. 10 | 258 => 0,
799 11 .. 18 => 1,
800 19 .. 34 => 2,
801 35 .. 66 => 3,
802 67 .. 130 => 4,
803 131 .. 257 => 5);
804
805 procedure Put_DL_code (distance : Distance_range; length : Length_range) is
806 extra_bits : Natural;
807 begin
808 Put_Huffman_Code (curr_descr.lit_len (deflate_code_for_lz_length (length)));
809 -- Extra bits are needed to differentiate lengths sharing the same code.
810 extra_bits := extra_bits_for_lz_length (length);
811 if extra_bits > 0 then
812 -- We keep only the last extra_bits bits of the length (minus given offset).
813 -- Example: if extra_bits = 1, only the parity is sent (0 or 1);
814 -- the rest has been already sent with Put_Huffman_code above.
815 -- Equivalent: x:= x mod (2 ** extra_bits);
816 Put_Bits (
817 U32 (length - extra_bits_for_lz_length_offset (length))
818 and
819 (Shift_Left (U32'(1), extra_bits) - 1),
820 extra_bits);
821 end if;
822 -- Distance Codes
823 -- --------------
824 -- Extra Extra Extra Extra
825 -- Code Bits Dist Code Bits Dist Code Bits Distance Code Bits Distance
826 -- ---- ---- ---- ---- ---- ------ ---- ---- -------- ---- ---- --------
827 -- 0 0 1 8 3 17-24 16 7 257-384 24 11 4097-6144
828 -- 1 0 2 9 3 25-32 17 7 385-512 25 11 6145-8192
829 -- 2 0 3 10 4 33-48 18 8 513-768 26 12 8193-12288
830 -- 3 0 4 11 4 49-64 19 8 769-1024 27 12 12289-16384
831 -- 4 1 5,6 12 5 65-96 20 9 1025-1536 28 13 16385-24576
832 -- 5 1 7,8 13 5 97-128 21 9 1537-2048 29 13 24577-32768
833 -- 6 2 9-12 14 6 129-192 22 10 2049-3072
834 -- 7 2 13-16 15 6 193-256 23 10 3073-4096
835 --
836 --
837 -- Example: the code # 10 means the LZ distance (# positions back in the circular
838 -- message buffer for starting the copy) shall be 33, plus the value given
839 -- by the 4 extra bits (between 0 and 15).
840 --
841 case distance is
842 when 1 .. 4 => -- Codes 0..3, with no extra bit
843 Put_Huffman_Code (curr_descr.dis (distance - 1));
844 when 5 .. 8 => -- Codes 4..5, with 1 extra bit
845 Put_Huffman_Code (curr_descr.dis (4 + (distance - 5) / 2));
846 Put_Bits (U32 ((distance - 5) mod 2), 1);
847 when 9 .. 16 => -- Codes 6..7, with 2 extra bits
848 Put_Huffman_Code (curr_descr.dis (6 + (distance - 9) / 4));
849 Put_Bits (U32 ((distance - 9) mod 4), 2);
850 when 17 .. 32 => -- Codes 8..9, with 3 extra bits
851 Put_Huffman_Code (curr_descr.dis (8 + (distance - 17) / 8));
852 Put_Bits (U32 ((distance - 17) mod 8), 3);
853 when 33 .. 64 => -- Codes 10..11, with 4 extra bits
854 Put_Huffman_Code (curr_descr.dis (10 + (distance - 33) / 16));
855 Put_Bits (U32 ((distance - 33) mod 16), 4);
856 when 65 .. 128 => -- Codes 12..13, with 5 extra bits
857 Put_Huffman_Code (curr_descr.dis (12 + (distance - 65) / 32));
858 Put_Bits (U32 ((distance - 65) mod 32), 5);
859 when 129 .. 256 => -- Codes 14..15, with 6 extra bits
860 Put_Huffman_Code (curr_descr.dis (14 + (distance - 129) / 64));
861 Put_Bits (U32 ((distance - 129) mod 64), 6);
862 when 257 .. 512 => -- Codes 16..17, with 7 extra bits
863 Put_Huffman_Code (curr_descr.dis (16 + (distance - 257) / 128));
864 Put_Bits (U32 ((distance - 257) mod 128), 7);
865 when 513 .. 1024 => -- Codes 18..19, with 8 extra bits
866 Put_Huffman_Code (curr_descr.dis (18 + (distance - 513) / 256));
867 Put_Bits (U32 ((distance - 513) mod 256), 8);
868 when 1025 .. 2048 => -- Codes 20..21, with 9 extra bits
869 Put_Huffman_Code (curr_descr.dis (20 + (distance - 1025) / 512));
870 Put_Bits (U32 ((distance - 1025) mod 512), 9);
871 when 2049 .. 4096 => -- Codes 22..23, with 10 extra bits
872 Put_Huffman_Code (curr_descr.dis (22 + (distance - 2049) / 1024));
873 Put_Bits (U32 ((distance - 2049) mod 1024), 10);
874 when 4097 .. 8192 => -- Codes 24..25, with 11 extra bits
875 Put_Huffman_Code (curr_descr.dis (24 + (distance - 4097) / 2048));
876 Put_Bits (U32 ((distance - 4097) mod 2048), 11);
877 when 8193 .. 16384 => -- Codes 26..27, with 12 extra bits
878 Put_Huffman_Code (curr_descr.dis (26 + (distance - 8193) / 4096));
879 Put_Bits (U32 ((distance - 8193) mod 4096), 12);
880 when 16385 .. 32768 => -- Codes 28..29, with 13 extra bits
881 Put_Huffman_Code (curr_descr.dis (28 + (distance - 16385) / 8192));
882 Put_Bits (U32 ((distance - 16385) mod 8192), 13);
883 end case;
884 end Put_DL_code;
885
886 function Deflate_code_for_LZ_distance (distance : Distance_range) return Natural is
887 begin
888 case distance is
889 when 1 .. 4 => -- Codes 0..3, with no extra bit
890 return distance - 1;
891 when 5 .. 8 => -- Codes 4..5, with 1 extra bit
892 return 4 + (distance - 5) / 2;
893 when 9 .. 16 => -- Codes 6..7, with 2 extra bits
894 return 6 + (distance - 9) / 4;
895 when 17 .. 32 => -- Codes 8..9, with 3 extra bits
896 return 8 + (distance - 17) / 8;
897 when 33 .. 64 => -- Codes 10..11, with 4 extra bits
898 return 10 + (distance - 33) / 16;
899 when 65 .. 128 => -- Codes 12..13, with 5 extra bits
900 return 12 + (distance - 65) / 32;
901 when 129 .. 256 => -- Codes 14..15, with 6 extra bits
902 return 14 + (distance - 129) / 64;
903 when 257 .. 512 => -- Codes 16..17, with 7 extra bits
904 return 16 + (distance - 257) / 128;
905 when 513 .. 1024 => -- Codes 18..19, with 8 extra bits
906 return 18 + (distance - 513) / 256;
907 when 1025 .. 2048 => -- Codes 20..21, with 9 extra bits
908 return 20 + (distance - 1025) / 512;
909 when 2049 .. 4096 => -- Codes 22..23, with 10 extra bits
910 return 22 + (distance - 2049) / 1024;
911 when 4097 .. 8192 => -- Codes 24..25, with 11 extra bits
912 return 24 + (distance - 4097) / 2048;
913 when 8193 .. 16384 => -- Codes 26..27, with 12 extra bits
914 return 26 + (distance - 8193) / 4096;
915 when 16385 .. 32768 => -- Codes 28..29, with 13 extra bits
916 return 28 + (distance - 16385) / 8192;
917 end case;
918 end Deflate_code_for_LZ_distance;
919
920 -----------------
921 -- LZ Buffer --
922 -----------------
923
924 -- We buffer the LZ codes (plain, or distance/length) in order to
925 -- analyse them and try to do smart things.
926
927 max_expand : constant := 14; -- *Tuned* Sometimes it is better to store data and expand short strings
928 code_for_max_expand : constant := 266;
929 subtype Expanded_data is Byte_Buffer (1 .. max_expand);
930
931 type LZ_atom_kind is (plain_byte, distance_length);
932 type LZ_atom is record
933 kind : LZ_atom_kind;
934 plain : Byte;
935 lz_distance : Natural;
936 lz_length : Natural;
937 lz_expanded : Expanded_data;
938 end record;
939
940 -- *Tuned*. Min: 2**14, = 16384 (min half buffer 8192)
941 -- Optimal so far: 2**17
942 LZ_buffer_size : constant := 2**17;
943 type LZ_buffer_index_type is mod LZ_buffer_size;
944 type LZ_buffer_type is array (LZ_buffer_index_type range <>) of LZ_atom;
945
946 empty_lit_len_stat : constant Stats_lit_len_type := (End_Of_Block => 1, others => 0);
947 -- End_Of_Block will have to happen once, but never appears in the LZ statistics...
948 empty_dis_stat : constant Stats_dis_type := (others => 0);
949
950 --
951 -- Compute statistics for both Literal-length, and Distance alphabets, from a LZ buffer
952 --
953 procedure Get_statistics (
954 lzb : in LZ_buffer_type;
955 stats_lit_len : out Stats_lit_len_type;
956 stats_dis : out Stats_dis_type
957 )
958 is
959 lit_len : Alphabet_lit_len;
960 dis : Alphabet_dis;
961 begin
962 stats_lit_len := empty_lit_len_stat;
963 stats_dis := empty_dis_stat;
964 for i in lzb'Range loop
965 case lzb (i).kind is
966 when plain_byte =>
967 lit_len := Alphabet_lit_len (lzb (i).plain);
968 stats_lit_len (lit_len) := stats_lit_len (lit_len) + 1; -- +1 for this literal
969 when distance_length =>
970 lit_len := deflate_code_for_lz_length (lzb (i).lz_length);
971 stats_lit_len (lit_len) := stats_lit_len (lit_len) + 1; -- +1 for this length code
972 dis := Deflate_code_for_LZ_distance (lzb (i).lz_distance);
973 stats_dis (dis) := stats_dis (dis) + 1; -- +1 for this distance code
974 end case;
975 end loop;
976 end Get_statistics;
977
978 --
979 -- Send a LZ buffer using currently defined Huffman codes
980 --
981 procedure Put_LZ_buffer (lzb : LZ_buffer_type) is
982 begin
983 for i in lzb'Range loop
984 case lzb (i).kind is
985 when plain_byte =>
986 Put_literal_byte (lzb (i).plain);
987 when distance_length =>
988 Put_DL_code (lzb (i).lz_distance, lzb (i).lz_length);
989 end case;
990 end loop;
991 end Put_LZ_buffer;
992
993 block_to_finish : Boolean := False;
994 last_block_marked : Boolean := False;
995 type Block_type is (stored, fixed, dynamic, reserved); -- Appnote, 5.5.2
996 -- If last_block_type = dynamic, we may recycle previous block's Huffman codes
997 last_block_type : Block_type := reserved;
998
999 procedure Mark_new_block (last_block_for_stream : Boolean) is
1000 begin
1001 if block_to_finish and last_block_type in fixed .. dynamic then
1002 Put_Huffman_Code (curr_descr.lit_len (End_Of_Block)); -- Finish previous block
1003 end if;
1004 block_to_finish := True;
1005 Put_Bits (code => Boolean'Pos (last_block_for_stream), code_size => 1);
1006 last_block_marked := last_block_for_stream;
1007 end Mark_new_block;
1008
1009 -- Send a LZ buffer completely decoded as literals (LZ compression is discarded)
1010 procedure Expand_LZ_buffer (lzb : LZ_buffer_type; last_block : Boolean) is
1011 b1, b2 : Byte;
1012 to_be_sent : Natural_M32 := 0;
1013 -- to_be_sent is not always equal to lzb'Length: sometimes you have a DL code
1014 mid : LZ_buffer_index_type;
1015 begin
1016 for i in lzb'Range loop
1017 case lzb (i).kind is
1018 when plain_byte =>
1019 to_be_sent := to_be_sent + 1;
1020 when distance_length =>
1021 to_be_sent := to_be_sent + Natural_M32 (lzb (i).lz_length);
1022 end case;
1023 end loop;
1024 if to_be_sent > 16#FFFF# then -- Ow, cannot send all that in one chunk.
1025 -- Instead of a tedious block splitting, just divide and conquer:
1026 mid := LZ_buffer_index_type ((Natural_M32 (lzb'First) + Natural_M32 (lzb'Last)) / 2);
1027 if trace then
1028 Put_Line (log,
1029 "Expand_LZ_buffer: splitting large stored block: " &
1030 LZ_buffer_index_type'Image (lzb'First) &
1031 LZ_buffer_index_type'Image (mid) &
1032 LZ_buffer_index_type'Image (lzb'Last)
1033 );
1034 end if;
1035 Expand_LZ_buffer (lzb (lzb'First .. mid), last_block => False);
1036 Expand_LZ_buffer (lzb (mid + 1 .. lzb'Last), last_block => last_block);
1037 return;
1038 end if;
1039 if trace then
1040 Put_Line (log, "Expand_LZ_buffer: sending" & Natural_M32'Image (to_be_sent) & " 'plain' bytes");
1041 end if;
1042 b1 := Byte (to_be_sent mod 256);
1043 b2 := Byte (to_be_sent / 256);
1044 Mark_new_block (last_block_for_stream => last_block);
1045 last_block_type := stored;
1046 Put_Bits (code => 0, code_size => 2); -- Signals a "stored" block
1047 Flush_bit_buffer; -- Go to byte boundary
1048 Put_byte (b1);
1049 Put_byte (b2);
1050 Put_byte (not b1);
1051 Put_byte (not b2);
1052 for i in lzb'Range loop
1053 case lzb (i).kind is
1054 when plain_byte =>
1055 Put_byte (lzb (i).plain);
1056 when distance_length =>
1057 for j in 1 .. lzb (i).lz_length loop
1058 Put_byte (lzb (i).lz_expanded (j));
1059 end loop;
1060 end case;
1061 end loop;
1062 end Expand_LZ_buffer;
1063
1064 -- Extra bits that need to be sent after various Deflate codes
1065
1066 extra_bits_for_lz_length_code : constant array (257 .. 285) of Natural :=
1067 (257 .. 264 => 0,
1068 265 .. 268 => 1,
1069 269 .. 272 => 2,
1070 273 .. 276 => 3,
1071 277 .. 280 => 4,
1072 281 .. 284 => 5,
1073 285 => 0
1074 );
1075
1076 extra_bits_for_lz_distance_code : constant array (0 .. 29) of Natural :=
1077 (0 .. 3 => 0,
1078 4 .. 5 => 1,
1079 6 .. 7 => 2,
1080 8 .. 9 => 3,
1081 10 .. 11 => 4,
1082 12 .. 13 => 5,
1083 14 .. 15 => 6,
1084 16 .. 17 => 7,
1085 18 .. 19 => 8,
1086 20 .. 21 => 9,
1087 22 .. 23 => 10,
1088 24 .. 25 => 11,
1089 26 .. 27 => 12,
1090 28 .. 29 => 13
1091 );
1092
1093 subtype Long_length_codes is
1094 Alphabet_lit_len range code_for_max_expand + 1 .. Alphabet_lit_len'Last;
1095 zero_bl_long_lengths : constant Stats_type (Long_length_codes) := (others => 0);
1096
1097 -- Send_as_block.
1098 --
1099 -- lzb (can be a slice of the principal buffer) will be sent as:
1100 -- * a new "dynamic" block, preceded by a compression structure header
1101 -- or * the continuation of previous "dynamic" block
1102 -- or * a new "fixed" block, if lz data's Huffman descriptor is close enough to "fixed"
1103 -- or * a new "stored" block, if lz data are too random
1104
1105 procedure Send_as_block (lzb : LZ_buffer_type; last_block : Boolean) is
1106 new_descr, new_descr_2 : Deflate_Huff_Descriptors;
1107 --
1108 procedure Send_fixed_block is
1109 begin
1110 if last_block_type = fixed then
1111 -- Cool, we don't need to mark a block boundary: the Huffman codes are already
1112 -- the expected ones. We can just continue sending the LZ atoms.
1113 null;
1114 else
1115 Mark_new_block (last_block_for_stream => last_block);
1116 curr_descr := Deflate_fixed_descriptors;
1117 Put_Bits (code => 1, code_size => 2); -- Signals a "fixed" block
1118 last_block_type := fixed;
1119 end if;
1120 Put_LZ_buffer (lzb);
1121 end Send_fixed_block;
1122 --
1123 stats_lit_len, stats_lit_len_2 : Stats_lit_len_type;
1124 stats_dis, stats_dis_2 : Stats_dis_type;
1125 --
1126 procedure Send_dynamic_block (dyn : Deflate_Huff_Descriptors) is
1127 dummy : Count_type := 0;
1128 begin
1129 Mark_new_block (last_block_for_stream => last_block);
1130 curr_descr := Prepare_Huffman_Codes (dyn);
1131 Put_Bits (code => 2, code_size => 2); -- Signals a "dynamic" block
1132 Put_Compression_Structure (curr_descr, cost_analysis => False, bits => dummy);
1133 Put_LZ_buffer (lzb);
1134 last_block_type := dynamic;
1135 end Send_dynamic_block;
1136 -- The following variables will contain the *exact* number of bits taken
1137 -- by the block to be sent, using different Huffman encodings, or stored.
1138 stored_format_bits, -- Block is stored (no compression)
1139 fixed_format_bits, -- Fixed (preset) Huffman codes
1140 dynamic_format_bits, -- Dynamic Huffman codes using block's statistics
1141 dynamic_format_bits_2, -- Dynamic Huffman codes after Tweak_for_better_RLE
1142 recycled_format_bits : Count_type := 0; -- Continue previous block, use current Huffman codes
1143 --
1144 stored_format_possible : Boolean; -- Can we store (needs expansion of DL codes) ?
1145 recycling_possible : Boolean; -- Can we recycle current Huffman codes ?
1146 --
1147 procedure Compute_sizes_of_variants is
1148 c : Count_type;
1149 extra : Natural;
1150 begin
1151 -- We count bits taken by literals, for each block format variant.
1152 for i in 0 .. 255 loop
1153 c := stats_lit_len (i); -- This literal appears c times in the LZ buffer
1154 stored_format_bits := stored_format_bits + 8 * c;
1155 fixed_format_bits := fixed_format_bits + Count_type (default_lit_len_bl (i)) * c;
1156 dynamic_format_bits := dynamic_format_bits + Count_type (new_descr.lit_len (i).bit_length) * c;
1157 dynamic_format_bits_2 := dynamic_format_bits_2 + Count_type (new_descr_2.lit_len (i).bit_length) * c;
1158 recycled_format_bits := recycled_format_bits + Count_type (curr_descr.lit_len (i).bit_length) * c;
1159 end loop;
1160 -- We count bits taken by DL codes.
1161 if stored_format_possible then
1162 for i in lzb'Range loop
1163 case lzb (i).kind is
1164 when plain_byte =>
1165 null; -- Already counted
1166 when distance_length =>
1167 -- In the stored format, DL codes are expanded
1168 stored_format_bits := stored_format_bits + 8 * Count_type (lzb (i).lz_length);
1169 end case;
1170 end loop;
1171 end if;
1172 -- For compressed formats, count Huffman bits and extra bits.
1173 -- Lengths codes:
1174 for i in 257 .. 285 loop
1175 c := stats_lit_len (i); -- This length code appears c times in the LZ buffer
1176 extra := extra_bits_for_lz_length_code (i);
1177 fixed_format_bits := fixed_format_bits + Count_type (default_lit_len_bl (i) + extra) * c;
1178 dynamic_format_bits := dynamic_format_bits + Count_type (new_descr.lit_len (i).bit_length + extra) * c;
1179 dynamic_format_bits_2 := dynamic_format_bits_2 + Count_type (new_descr_2.lit_len (i).bit_length + extra) * c;
1180 recycled_format_bits := recycled_format_bits + Count_type (curr_descr.lit_len (i).bit_length + extra) * c;
1181 end loop;
1182 -- Distance codes:
1183 for i in 0 .. 29 loop
1184 c := stats_dis (i); -- This distance code appears c times in the LZ buffer
1185 extra := extra_bits_for_lz_distance_code (i);
1186 fixed_format_bits := fixed_format_bits + Count_type (default_dis_bl (i) + extra) * c;
1187 dynamic_format_bits := dynamic_format_bits + Count_type (new_descr.dis (i).bit_length + extra) * c;
1188 dynamic_format_bits_2 := dynamic_format_bits_2 + Count_type (new_descr_2.dis (i).bit_length + extra) * c;
1189 recycled_format_bits := recycled_format_bits + Count_type (curr_descr.dis (i).bit_length + extra) * c;
1190 end loop;
1191 -- Supplemental bits to be counted
1192 --
1193 stored_format_bits := stored_format_bits +
1194 (1 + (stored_format_bits / 8) / 65_535) -- Number of stored blocks needed
1195 * 5 -- 5 bytes per header
1196 * 8; -- ... converted into bits
1197 --
1198 c := 1; -- Is-last-block flag
1199 if block_to_finish and last_block_type in fixed .. dynamic then
1200 c := c + Count_type (curr_descr.lit_len (End_Of_Block).bit_length);
1201 end if;
1202 stored_format_bits := stored_format_bits + c;
1203 fixed_format_bits := fixed_format_bits + c + 2;
1204 dynamic_format_bits := dynamic_format_bits + c + 2;
1205 dynamic_format_bits_2 := dynamic_format_bits_2 + c + 2;
1206 -- For both dynamic formats, we also counts the bits taken by the compression header!
1207 Put_Compression_Structure (new_descr, cost_analysis => True, bits => dynamic_format_bits);
1208 Put_Compression_Structure (new_descr_2, cost_analysis => True, bits => dynamic_format_bits_2);
1209 end Compute_sizes_of_variants;
1210 --
1211 optimal_format_bits : Count_type;
1212 begin
1213 Get_statistics (lzb, stats_lit_len, stats_dis);
1214 new_descr := Build_descriptors (stats_lit_len, stats_dis);
1215 stats_lit_len_2 := stats_lit_len;
1216 stats_dis_2 := stats_dis;
1217 Tweak_for_better_RLE (stats_lit_len_2);
1218 Tweak_for_better_RLE (stats_dis_2);
1219 new_descr_2 := Build_descriptors (stats_lit_len_2, stats_dis_2);
1220 -- For "stored" block format, prevent expansion of DL codes with length > max_expand.
1221 -- We check stats are all 0 for long length codes:
1222 stored_format_possible := stats_lit_len (Long_length_codes) = zero_bl_long_lengths;
1223 recycling_possible :=
1224 last_block_type = fixed -- The "fixed" alphabets use all symbols, then always recyclable.
1225 or else
1226 (last_block_type = dynamic and then Recyclable (curr_descr, new_descr));
1227 Compute_sizes_of_variants;
1228 if not stored_format_possible then
1229 stored_format_bits := Count_type'Last;
1230 end if;
1231 if not recycling_possible then
1232 recycled_format_bits := Count_type'Last;
1233 end if;
1234 optimal_format_bits := Count_type'Min (
1235 Count_type'Min (stored_format_bits, fixed_format_bits),
1236 Count_type'Min (
1237 Count_type'Min (dynamic_format_bits, dynamic_format_bits_2),
1238 recycled_format_bits)
1239 );
1240 --
1241 -- Selection of the block format with smallest size.
1242 --
1243 if fixed_format_bits = optimal_format_bits then
1244 if trace then
1245 Put_Line (log, "### New ""fixed"" block");
1246 end if;
1247 Send_fixed_block;
1248 elsif dynamic_format_bits = optimal_format_bits then
1249 if trace then
1250 Put_Line (log, "### New ""dynamic"" block with compression structure header");
1251 end if;
1252 Send_dynamic_block (new_descr);
1253 elsif dynamic_format_bits_2 = optimal_format_bits then
1254 if trace then
1255 Put_Line (log, "### New ""dynamic"" block, RLE-tweaked, with compression structure header");
1256 end if;
1257 Send_dynamic_block (new_descr_2);
1258 elsif recycled_format_bits = optimal_format_bits then
1259 if trace then
1260 Put_Line (log, "### Recycle: continue using existing Huffman compression structures");
1261 end if;
1262 Put_LZ_buffer (lzb);
1263 else -- We have stored_format_bits = optimal_format_bits
1264 if trace then
1265 Put_Line (log, "### Too random - use ""stored"" block");
1266 end if;
1267 Expand_LZ_buffer (lzb, last_block);
1268 end if;
1269 end Send_as_block;
1270
1271 subtype Full_range_LZ_buffer_type is LZ_buffer_type (LZ_buffer_index_type);
1272 type p_Full_range_LZ_buffer_type is access Full_range_LZ_buffer_type;
1273 procedure Dispose is
1274 new Ada.Unchecked_Deallocation (Full_range_LZ_buffer_type, p_Full_range_LZ_buffer_type);
1275
1276 -- This is the main, big, fat, circular buffer containing LZ codes,
1277 -- each LZ code being a literal or a DL code.
1278 -- Heap allocation is needed only because default stack is too small on some targets.
1279 lz_buffer : p_Full_range_LZ_buffer_type := null;
1280 lz_buffer_index : LZ_buffer_index_type := 0;
1281 past_lz_data : Boolean := False;
1282 -- When True: some LZ_buffer_size data before lz_buffer_index (modulo!) are real, past data
1283
1284 ---------------------------------------------------------------------------------
1285 -- Scanning and sampling: the really sexy part of the Taillaule algorithm... --
1286 ---------------------------------------------------------------------------------
1287
1288 -- We examine similarities in the LZ data flow at different step sizes.
1289 -- If the optimal Huffman encoding for this portion is very different, we choose to
1290 -- cut current block and start a new one. The shorter the step, the higher the threshold
1291 -- for starting a dynamic block, since the compression header is taking some room each time.
1292
1293 -- *Tuned* (a bit...)
1294 min_step : constant := 750;
1295
1296 type Step_threshold_metric is record
1297 slider_step : LZ_buffer_index_type; -- Should be a multiple of min_step.
1298 cutting_threshold : Positive;
1299 metric : Distance_type;
1300 end record;
1301
1302 -- *Tuned* thresholds
1303 -- NB: the enwik8, then silesia, then others tests are tough for lowering any!
1304 step_choice : constant array (Positive range <>) of Step_threshold_metric :=
1305 ((8 * min_step, 420, L1_tweaked), -- Deflate_1, Deflate_2, Deflate_3 (enwik8)
1306 (4 * min_step, 430, L1_tweaked), -- Deflate_2, Deflate_3 (silesia)
1307 (min_step, 2050, L1_tweaked) -- Deflate_3 (DB test)
1308 );
1309
1310 max_choice : constant array (Taillaule_Deflation_Method) of Positive :=
1311 (Deflate_1 => 1, Deflate_2 => 2, others => step_choice'Last);
1312
1313 slider_size : constant := 4096;
1314 half_slider_size : constant := slider_size / 2;
1315 slider_max : constant := slider_size - 1;
1316
1317 -- Phases (A) and (B) are done in a single function: we get Huffman
1318 -- descriptors that should be good for encoding a given sequence of LZ atoms.
1319 function Build_descriptors (lzb : LZ_buffer_type) return Deflate_Huff_Descriptors is
1320 stats_lit_len : Stats_lit_len_type;
1321 stats_dis : Stats_dis_type;
1322 begin
1323 Get_statistics (lzb, stats_lit_len, stats_dis);
1324 return Build_descriptors (stats_lit_len, stats_dis);
1325 end Build_descriptors;
1326
1327 procedure Scan_and_send_from_main_buffer (from, to : LZ_buffer_index_type; last_flush : Boolean) is
1328 -- The following descriptors are *not* used for compressing, but for detecting similarities.
1329 initial_hd, sliding_hd : Deflate_Huff_Descriptors;
1330 start, slide_mid, send_from : LZ_buffer_index_type;
1331 sliding_hd_computed : Boolean;
1332 begin
1333 if to - from < slider_max then
1334 Send_as_block (lz_buffer (from .. to), last_flush);
1335 return;
1336 end if;
1337 -- For further comments: n := LZ_buffer_size
1338 if past_lz_data then -- We have n / 2 previous data before 'from'.
1339 start := from - LZ_buffer_index_type (half_slider_size);
1340 else
1341 start := from; -- Cannot have past data
1342 end if;
1343 if start > from then -- Looped over, (mod n). Slider data are in two chunks in main buffer
1344 -- put_line(from'img & to'img & start'img);
1345 declare
1346 copy_from : LZ_buffer_index_type := start;
1347 copy : LZ_buffer_type (0 .. slider_max);
1348 begin
1349 for i in copy'Range loop
1350 copy (i) := lz_buffer (copy_from);
1351 copy_from := copy_from + 1; -- Loops over (mod n)
1352 end loop;
1353 initial_hd := Build_descriptors (copy);
1354 end;
1355 -- Concatenation instead of above loop bombs with a Range Check error:
1356 -- lz_buffer(start .. lz_buffer'Last) &
1357 -- lz_buffer(0 .. start + LZ_buffer_index_type(slider_max))
1358 else
1359 initial_hd := Build_descriptors (lz_buffer (start .. start + slider_max));
1360 end if;
1361 send_from := from;
1362 slide_mid := from + min_step;
1363 Scan_LZ_data :
1364 while Integer_M32 (slide_mid) + half_slider_size < Integer_M32 (to) loop
1365 exit Scan_LZ_data when deactivate_scanning;
1366 sliding_hd_computed := False;
1367 Browse_step_level :
1368 for level in step_choice'Range loop
1369 exit Browse_step_level when level > max_choice (method);
1370 if (slide_mid - from) mod step_choice (level).slider_step = 0 then
1371 if not sliding_hd_computed then
1372 sliding_hd := Build_descriptors (lz_buffer (slide_mid - half_slider_size .. slide_mid + half_slider_size));
1373 sliding_hd_computed := True;
1374 end if;
1375 if not Similar (
1376 initial_hd,
1377 sliding_hd,
1378 step_choice (level).metric,
1379 step_choice (level).cutting_threshold,
1380 "Compare sliding to initial (step size=" &
1381 LZ_buffer_index_type'Image (step_choice (level).slider_step) & ')'
1382 )
1383 then
1384 if trace then
1385 Put_Line (log,
1386 "### Cutting @ " & LZ_buffer_index_type'Image (slide_mid) &
1387 " ('from' is" & LZ_buffer_index_type'Image (from) &
1388 ", 'to' is" & LZ_buffer_index_type'Image (to) & ')'
1389 );
1390 end if;
1391 Send_as_block (lz_buffer (send_from .. slide_mid - 1), last_block => False);
1392 send_from := slide_mid;
1393 initial_hd := sliding_hd; -- Reset reference descriptor for further comparisons
1394 exit Browse_step_level; -- Cutting once at a given place is enough :-)
1395 end if;
1396 end if;
1397 end loop Browse_step_level;
1398 -- Exit before an eventual increment of slide_mid that would loop over (mod n).
1399 exit Scan_LZ_data when Integer_M32 (slide_mid) + min_step + half_slider_size >= Integer_M32 (to);
1400 slide_mid := slide_mid + min_step;
1401 end loop Scan_LZ_data;
1402 --
1403 -- Send last block for slice from .. to.
1404 --
1405 if send_from <= to then
1406 Send_as_block (lz_buffer (send_from .. to), last_block => last_flush);
1407 end if;
1408 end Scan_and_send_from_main_buffer;
1409
1410 procedure Flush_half_buffer (last_flush : Boolean) is
1411 last_idx : constant LZ_buffer_index_type := lz_buffer_index - 1;
1412 n_div_2 : constant := LZ_buffer_size / 2;
1413 begin
1414 if last_idx < n_div_2 then
1415 Scan_and_send_from_main_buffer (0, last_idx, last_flush); -- 1st half
1416 else
1417 Scan_and_send_from_main_buffer (n_div_2, last_idx, last_flush); -- 2nd half
1418 end if;
1419 -- From this point, all further calls to Flush_half_buffer will
1420 -- have n_div_2 elements of past data.
1421 past_lz_data := True;
1422 end Flush_half_buffer;
1423
1424 procedure Push (a : LZ_atom) is
1425 pragma Inline (Push);
1426 begin
1427 lz_buffer (lz_buffer_index) := a;
1428 lz_buffer_index := lz_buffer_index + 1; -- becomes 0 when reaching LZ_buffer_size (modular)
1429 if lz_buffer_index * 2 = 0 then
1430 Flush_half_buffer (last_flush => False);
1431 end if;
1432 end Push;
1433
1434 procedure Put_or_delay_literal_byte (b : Byte) is
1435 pragma Inline (Put_or_delay_literal_byte);
1436 begin
1437 case method is
1438 when Deflate_Fixed =>
1439 Put_literal_byte (b); -- Buffering is not needed in this mode
1440 when Taillaule_Deflation_Method =>
1441 Push ((plain_byte, b, 0, 0, (b, others => 0)));
1442 end case;
1443 end Put_or_delay_literal_byte;
1444
1445 procedure Put_or_delay_DL_code (distance, length : Integer; expand : Expanded_data) is
1446 pragma Inline (Put_or_delay_DL_code);
1447 begin
1448 case method is
1449 when Deflate_Fixed =>
1450 Put_DL_code (distance, length); -- Buffering is not needed in this mode
1451 when Taillaule_Deflation_Method =>
1452 Push ((distance_length, 0, distance, length, expand));
1453 end case;
1454 end Put_or_delay_DL_code;
1455
1456 --------------------------------
1457 -- LZ77 front-end compression --
1458 --------------------------------
1459
1460 procedure Encode is
1461
1462 feedback_milestone,
1463 Bytes_in : Zip_Streams.ZS_Size_Type := 0; -- Count of input file bytes processed
1464 user_aborting : Boolean;
1465 PctDone : Natural;
1466
1467 function Read_byte return Byte is
1468 b : Byte;
1469 use Zip_Streams;
1470 begin
1471 b := IO_buffers.InBuf (IO_buffers.InBufIdx);
1472 IO_buffers.InBufIdx := IO_buffers.InBufIdx + 1;
1473 Zip.CRC_Crypto.Update (CRC, (1 => b));
1474 Bytes_in := Bytes_in + 1;
1475 if feedback /= null then
1476 if Bytes_in = 1 then
1477 feedback (0, False, user_aborting);
1478 end if;
1479 if feedback_milestone > 0 and then
1480 ((Bytes_in - 1) mod feedback_milestone = 0
1481 or Bytes_in = ZS_Size_Type (input_size))
1482 then
1483 if input_size_known then
1484 PctDone := Integer ((100.0 * Float (Bytes_in)) / Float (input_size));
1485 feedback (PctDone, False, user_aborting);
1486 else
1487 feedback (0, False, user_aborting);
1488 end if;
1489 if user_aborting then
1490 raise User_abort;
1491 end if;
1492 end if;
1493 end if;
1494 return b;
1495 end Read_byte;
1496
1497 function More_bytes return Boolean is
1498 begin
1499 if IO_buffers.InBufIdx > IO_buffers.MaxInBufIdx then
1500 Read_Block (IO_buffers, input);
1501 end if;
1502 return not IO_buffers.InputEoF;
1503 end More_bytes;
1504
1505 -- LZ77 parameters
1506 Look_Ahead_LZ77 : constant Integer := 258;
1507 String_buffer_size : constant := 2**15; -- Required: 2**15 for Deflate, 2**16 for Deflate_e
1508 type Text_buffer_index is mod String_buffer_size;
1509 type Text_buffer is array (Text_buffer_index) of Byte;
1510 Text_Buf : Text_buffer;
1511 R : Text_buffer_index;
1512
1513 -- If the DLE coding doesn't fit the format constraints, we need
1514 -- to decode it as a simple sequence of literals. The buffer used is
1515 -- called "Text" buffer by reference to "clear-text", but actually it
1516 -- is any binary data.
1517
1518 procedure LZ77_emits_DL_code (distance, length : Integer) is
1519 -- NB: no worry, all arithmetics in Text_buffer_index are modulo String_buffer_size.
1520 b : Byte;
1521 copy_start : Text_buffer_index;
1522 expand : Expanded_data;
1523 ie : Positive := 1;
1524 begin
1525 if distance = String_buffer_size then -- Happens with 7-Zip, cannot happen with Info-Zip.
1526 copy_start := R;
1527 else
1528 copy_start := R - Text_buffer_index (distance);
1529 end if;
1530 -- Expand into the circular text buffer to have it up to date
1531 for K in 0 .. Text_buffer_index (length - 1) loop
1532 b := Text_Buf (copy_start + K);
1533 Text_Buf (R) := b;
1534 R := R + 1;
1535 if ie <= max_expand then -- Also memorize short sequences for LZ buffer
1536 expand (ie) := b; -- for the case a block needs to be stored in clear.
1537 ie := ie + 1;
1538 end if;
1539 end loop;
1540 if distance in Distance_range and length in Length_range then
1541 Put_or_delay_DL_code (distance, length, expand);
1542 else
1543 if trace then
1544 Put_Line (log,
1545 "<> Too bad, cannot encode this distance-length pair, " &
1546 "then we have to expand to output (dist = " & Integer'Image (distance) &
1547 " len=" & Integer'Image (length) & ")"
1548 );
1549 end if;
1550 for K in 0 .. Text_buffer_index (length - 1) loop
1551 Put_or_delay_literal_byte (Text_Buf (copy_start + K));
1552 end loop;
1553 end if;
1554 end LZ77_emits_DL_code;
1555
1556 procedure LZ77_emits_literal_byte (b : Byte) is
1557 begin
1558 Text_Buf (R) := b;
1559 R := R + 1;
1560 Put_or_delay_literal_byte (b);
1561 end LZ77_emits_literal_byte;
1562
1563 procedure Dummy_Estimate_DL_Codes (
1564 matches : in out LZ77.Matches_Array;
1565 old_match_index : in Natural;
1566 prefixes : in LZ77.Byte_Array;
1567 best_score_index : out Positive;
1568 best_score_set : out LZ77.Prefetch_Index_Type;
1569 match_trace : out LZ77.DLP_Array
1570 )
1571 is null;
1572
1573 LZ77_choice : constant array (Deflation_Method) of LZ77.Method_Type :=
1574 (Deflate_Fixed => LZ77.IZ_4,
1575 Deflate_0 => LZ77.No_LZ77,
1576 Deflate_1 => LZ77.IZ_6, -- level 6 is the default in Info-Zip's zip.exe
1577 Deflate_2 => LZ77.IZ_8,
1578 Deflate_3 => LZ77.IZ_10,
1579 Deflate_R => LZ77.Rich);
1580
1581 procedure My_LZ77 is
1582 new LZ77.Encode
1583 (String_buffer_size => String_buffer_size,
1584 Look_Ahead => Look_Ahead_LZ77,
1585 Threshold => 2, -- From a string match length > 2, a DL code is sent
1586 Method => LZ77_choice (method),
1587 Read_Byte => Read_byte,
1588 More_Bytes => More_bytes,
1589 Write_Literal => LZ77_emits_literal_byte,
1590 Write_DL_Code => LZ77_emits_DL_code,
1591 Estimate_DL_Codes => Dummy_Estimate_DL_Codes
1592 );
1593
1594 begin -- Encode
1595 Read_Block (IO_buffers, input);
1596 R := Text_buffer_index (String_buffer_size - Look_Ahead_LZ77);
1597 if input_size_known then
1598 feedback_milestone := Zip_Streams.ZS_Size_Type (input_size / feedback_steps);
1599 end if;
1600 case method is
1601 when Deflate_Fixed => -- "Fixed" (predefined) compression structure
1602 -- We have only one compressed data block, then it is already the last one.
1603 Put_Bits (code => 1, code_size => 1); -- Signals last block
1604 Put_Bits (code => 1, code_size => 2); -- Signals a "fixed" block
1605 when Taillaule_Deflation_Method =>
1606 null; -- No start data sent, all is delayed
1607 end case;
1608 ----------------------------------------------------------------
1609 -- The whole compression is happening in the following line: --
1610 ----------------------------------------------------------------
1611 My_LZ77;
1612 -- Done. Send the code signaling the end of compressed data block:
1613 case method is
1614 when Deflate_Fixed =>
1615 Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1616 when Taillaule_Deflation_Method =>
1617 if lz_buffer_index * 2 = 0 then -- Already flushed at latest Push, or empty data
1618 if block_to_finish and then last_block_type in fixed .. dynamic then
1619 Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1620 end if;
1621 else
1622 Flush_half_buffer (last_flush => True);
1623 if last_block_type in fixed .. dynamic then
1624 Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1625 end if;
1626 end if;
1627 if not last_block_marked then
1628 -- Add a fake fixed block, just to have a final block...
1629 Put_Bits (code => 1, code_size => 1); -- Signals last block
1630 Put_Bits (code => 1, code_size => 2); -- Signals a "fixed" block
1631 curr_descr := Deflate_fixed_descriptors;
1632 Put_Huffman_Code (curr_descr.lit_len (End_Of_Block));
1633 end if;
1634 end case;
1635 end Encode;
1636
1637 procedure Deallocation is
1638 begin
1639 Dispose (lz_buffer);
1640 Deallocate_Buffers (IO_buffers);
1641 end Deallocation;
1642
1643 begin
1644 if trace then
1645 begin
1646 Open (log, Append_File, log_name);
1647 exception
1648 when Name_Error =>
1649 Create (log, Out_File, log_name);
1650 end;
1651 Put (log, "New stream" & sep & sep & sep & sep & sep & sep & sep & sep);
1652 if input_size_known then
1653 Put (log, sep & Zip_64_Data_Size_Type'Image (input_size) &
1654 sep & sep & sep & sep & sep & sep & "bytes input");
1655 end if;
1656 New_Line (log);
1657 end if;
1658 Allocate_Buffers (IO_buffers, input_size_known, input_size);
1659 output_size := 0;
1660 lz_buffer := new Full_range_LZ_buffer_type;
1661 begin
1662 Encode;
1663 compression_ok := True;
1664 Flush_bit_buffer;
1665 Flush_byte_buffer;
1666 exception
1667 when Compression_inefficient => -- Escaped from Encode
1668 compression_ok := False;
1669 end;
1670 if trace then
1671 Close (log);
1672 end if;
1673 Deallocation;
1674 exception
1675 when others =>
1676 Deallocation;
1677 raise;
1678 end Zip.Compress.Deflate;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.