Source file : zip-compress-reduce.adb
1 -- Legal licensing note:
2
3 -- Copyright (c) 2009 .. 2023 Gautier de Montmollin
4 -- SWITZERLAND
5
6 -- Permission is hereby granted, free of charge, to any person obtaining a copy
7 -- of this software and associated documentation files (the "Software"), to deal
8 -- in the Software without restriction, including without limitation the rights
9 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 -- copies of the Software, and to permit persons to whom the Software is
11 -- furnished to do so, subject to the following conditions:
12
13 -- The above copyright notice and this permission notice shall be included in
14 -- all copies or substantial portions of the Software.
15
16 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 -- THE SOFTWARE.
23
24 -- NB: this is the MIT License, as found on the site
25 -- http://www.opensource.org/licenses/mit-license.php
26 --
27 -----------------
28 -- "Reduce" method - probabilistic reduction with a Markov chain.
29 -- See package specification for details.
30 --
31
32 -- Change log:
33 --
34 -- 7-Feb-2009: GdM: added a cache for LZ77 output to make 2nd phase faster
35
36 with LZ77;
37
38 with Ada.Text_IO;
39
40 procedure Zip.Compress.Reduce
41 (input,
42 output : in out Zip_Streams.Root_Zipstream_Type'Class;
43 input_size_known : Boolean;
44 input_size : Zip_64_Data_Size_Type; -- ignored if unknown
45 feedback : Feedback_Proc;
46 method : Reduction_Method;
47 CRC : in out Interfaces.Unsigned_32; -- only updated here
48 crypto : in out CRC_Crypto.Crypto_pack;
49 output_size : out Zip_64_Data_Size_Type;
50 compression_ok : out Boolean) -- indicates when compressed <= uncompressed
51 is
52 use Interfaces;
53
54 reduction_factor : constant Positive :=
55 1 + Compression_Method'Pos (method) - Compression_Method'Pos (Reduce_1);
56 use Zip_Streams;
57
58 DLE_code : constant := 144; -- "Escape" character, leading to a Distance - Length code.
59 subtype Symbol_range is Integer range 0 .. 255;
60 subtype Follower_range is Symbol_range range 0 .. 31;
61 -- PKWARE appnote.txt limits to 32 followers.
62 -- Above 32, you get "PKUNZIP: (W03) Warning! file has bad table"
63 -- Up to 63 is indeed possible and accepted by unzip <=5.12, WinZip
64 -- and of course Zip-Ada :-)
65 -- Optimum with 63 is extremely rare, the gain on test
66 -- files showing a 63 is 0.02%.
67 -- Then, we prefer compatibility here.
68
69 Followers : array (Symbol_range, Follower_range) of Symbol_range :=
70 (others => (others => 0));
71 Slen : array (Symbol_range) of Symbol_range;
72
73 -- Bits taken by (x-1) mod 256:
74 B_Table : constant array (Symbol_range) of Integer :=
75 (0 => 8,
76 1 .. 2 => 1,
77 3 .. 4 => 2,
78 5 .. 8 => 3,
79 9 .. 16 => 4,
80 17 .. 32 => 5,
81 33 .. 64 => 6,
82 65 .. 128 => 7,
83 129 .. 255 => 8);
84
85 ------------------
86 -- Buffered I/O --
87 ------------------
88
89 IO_buffers : IO_Buffers_Type;
90
91 procedure Put_byte (B : Byte) is
92 begin
93 IO_buffers.OutBuf (IO_buffers.OutBufIdx) := B;
94 IO_buffers.OutBufIdx := IO_buffers.OutBufIdx + 1;
95 if IO_buffers.OutBufIdx > IO_buffers.OutBuf'Last then
96 Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
97 end if;
98 end Put_byte;
99
100 --------------------------------------------------------------------------
101
102 -----------------
103 -- Code buffer --
104 -----------------
105
106 Save_byte : Byte; -- Output code buffer
107 Bits_used : Byte; -- Index into output code buffer
108
109 procedure Flush_output is
110 begin
111 if Bits_used /= 0 then
112 Put_byte (Save_byte);
113 end if;
114 if IO_buffers.OutBufIdx > 1 then
115 Write_Block (IO_buffers, input_size_known, input_size, output, output_size, crypto);
116 end if;
117 end Flush_output;
118
119 procedure Put_code (Code : Byte; Code_size : Natural) is
120 Code_work : Byte;
121 temp, Save_byte_local, Bits_used_local : Byte;
122 begin
123 Code_work := Code;
124 temp := 0;
125 Save_byte_local := Save_byte;
126 Bits_used_local := Bits_used;
127 for count in reverse 1 .. Code_size loop
128 temp := 0;
129 if Code_work mod 2 = 1 then
130 temp := temp + 1;
131 end if;
132 Code_work := Code_work / 2;
133 temp := Shift_Left (temp, Integer (Bits_used_local));
134 Bits_used_local := Bits_used_local + 1;
135 Save_byte_local := Save_byte_local or temp;
136 if Bits_used_local = 8 then
137 Put_byte (Save_byte_local);
138 Save_byte_local := 0;
139 temp := 0;
140 Bits_used_local := 0;
141 end if;
142 end loop;
143 Save_byte := Save_byte_local;
144 Bits_used := Bits_used_local;
145 end Put_code;
146
147 procedure Show_symbol (S : Symbol_range) is
148 begin
149 if S in 32 .. 126 then
150 Ada.Text_IO.Put (Character'Val (S));
151 else
152 Ada.Text_IO.Put ('{' & Symbol_range'Image (S) & '}');
153 end if;
154 end Show_symbol;
155
156 procedure Save_Followers is
157 begin
158 for X in reverse Symbol_range loop
159 Put_code (Byte (Slen (X)), 6); -- max 2**6 followers per symbol
160 for I in 0 .. Slen (X) - 1 loop
161 Put_code (Byte (Followers (X, I)), 8);
162 end loop;
163 end loop;
164 end Save_Followers;
165
166 ----------------------------------------
167 -- Probabilistic back-end compression --
168 ----------------------------------------
169
170 subtype Count is Integer_32;
171
172 markov_d : array (Symbol_range, Symbol_range) of Count :=
173 (others => (others => 0));
174
175 -- Build probability of transition from symbol i to symbol j:
176 --
177 -- markov(i,j) = P(symbol i is followed by j)
178 -- = markov_d(i,j) / sum_k( markov_d(i,k))
179 --
180 -- Only the discrete matrix markov_d is stored.
181
182 total_row : array (Symbol_range) of Count;
183 -- total_row(i) = sum_k( markov_d(i,k))
184
185 order : array (Symbol_range, Symbol_range) of Symbol_range;
186
187 use_probas : constant Boolean := True;
188 trace : constant Boolean := False;
189
190 -- We use the most significant quantiles of each row of the Markov matrix
191 -- to allow for a frequent coding that is shorter than the symbol itself.
192 -- Otherwise, why doing all that ;-) ?...
193
194 has_follower : array (Symbol_range, Symbol_range) of Boolean :=
195 (others => (others => False));
196 follower_pos : array (Symbol_range, Symbol_range) of Follower_range;
197
198 -- follower_pos(a,b) only significant if has_follower(a,b) = True
199
200 procedure Show_partial_markov (ordered : Boolean) is
201 subtype subrange is
202 Symbol_range range Character'Pos ('a') .. Character'Pos ('i');
203 sk, min, max : Symbol_range;
204 begin
205 if ordered then
206 min := Follower_range'First;
207 max := Follower_range'Last;
208 else
209 min := subrange'First;
210 max := subrange'Last;
211 end if;
212 Ada.Text_IO.New_Line;
213 for si in subrange loop
214 Show_symbol (si);
215 Ada.Text_IO.Put ("| ");
216 for sj in min .. max loop
217 if ordered then -- show top probas
218 sk := order (si, sj);
219 else -- show probas in the same subrange
220 sk := sj;
221 end if;
222 Show_symbol (sk);
223 Ada.Text_IO.Put (':' & Count'Image (markov_d (si, sj)) & ' ');
224 end loop;
225 if ordered then
226 Ada.Text_IO.Put ("|" & Count'Image (total_row (si)) & Integer'Image (Slen (si)));
227 end if;
228 Ada.Text_IO.New_Line;
229 end loop;
230 end Show_partial_markov;
231
232 procedure Build_Followers is
233
234 procedure Swap (a, b : in out Count) is
235 pragma Inline (Swap);
236 c : Count;
237 begin c := a; a := b; b := c; end Swap;
238
239 procedure Swap (a, b : in out Symbol_range) is
240 pragma Inline (Swap);
241 c : Symbol_range;
242 begin c := a; a := b; b := c; end Swap;
243
244 -- Sort a row (sym_row) in the Markov matrix, largest probabilities first.
245 -- Original order is kept by the order matrix.
246 --
247 procedure Sort (sym_row : Symbol_range) is
248 -- A stupid bubble sort algo, but one working with sets
249 -- having big packs of same data (trouble with qsort)...
250 swapped : Boolean;
251 left : Symbol_range := Symbol_range'First;
252 begin
253 loop
254 swapped := False;
255 for i in reverse left .. Symbol_range'Last - 1 loop
256 if markov_d (sym_row, i) < markov_d (sym_row, i + 1) then
257 Swap (markov_d (sym_row, i), markov_d (sym_row, i + 1));
258 Swap (order (sym_row, i), order (sym_row, i + 1));
259 swapped := True;
260 end if;
261 end loop;
262 exit when not swapped;
263 left := left + 1;
264 end loop;
265 end Sort;
266
267 cumul : Count;
268 follo : Symbol_range;
269
270 subtype Bit_range is Integer range 0 .. 5;
271 -- 6 would be possible, PKWARE appnote.txt limits number
272 -- of followers to 2**5
273
274 max_follo : constant array (Bit_range) of Integer :=
275 -- mostly 2**n - 1
276 -- actual follower range is: 0..max_follo(bits)
277 (0 => -1, -- NB: not 0; range is empty here
278 1 => 1,
279 2 => 3,
280 3 => 7,
281 4 => 15,
282 5 => 31
283 -- 6 => 62 -- NB: not 63
284 );
285
286 cumul_per_bit_length : array (Bit_range) of Count := (others => 0);
287
288 subtype Real is Long_Float;
289
290 exp_size, min_size : Real;
291 bits_min : Bit_range;
292
293 begin -- Build_Followers
294 if not use_probas then
295 Slen := (others => 0);
296 return;
297 end if;
298 if trace then
299 Show_partial_markov (False);
300 end if;
301 for si in Symbol_range loop
302 total_row (si) := 0;
303 for sj in Symbol_range loop
304 order (si, sj) := sj;
305 total_row (si) := total_row (si) + markov_d (si, sj);
306 end loop;
307 Sort (si);
308 cumul := 0;
309 -- Define all possible followers to symbol si:
310 for sj in Follower_range loop
311 cumul := cumul + markov_d (si, sj);
312 cumul_per_bit_length (B_Table (sj + 1)) := cumul;
313 -- ^ Overwritten several times. When sj jumps to the next bit length
314 -- (say, bl+1), cumul_per_bit_length(bl) contains the amount of symbols
315 -- for the stream to be encoded that can be emitted as immediate
316 -- successors to symbol si by using follower shortcuts of bit length bl.
317 follo := order (si, sj);
318 Followers (si, sj) := follo;
319 follower_pos (si, follo) := sj;
320 end loop;
321 -- Now we decide to which length we are using the followers
322 min_size := Real (total_row (si)) * 8.0;
323 -- ^ Size of all codes, in bits, when no follower
324 -- at all is defined for symbol si. In the next
325 -- loop, we will work to reduce that size.
326 bits_min := 0;
327 for bits in 1 .. Bit_range'Last loop
328 -- We compute the exact size of reduced output (entire file!) for
329 -- symbol si when cutting the follower range to 0..2**bits-1.
330 exp_size :=
331 -- Coded followers:
332 Real (cumul_per_bit_length (bits)) * Real (bits + 1) +
333 -- All codes outside the follower list will take 8+1 bits:
334 Real (total_row (si) - cumul_per_bit_length (bits)) * 9.0 +
335 -- Also the follower list at the beginning takes place:
336 Real (max_follo (bits) + 1) * 8.0;
337 if exp_size < min_size then
338 -- So far, it is more efficient to encode si's followers with 'bits' bits.
339 min_size := exp_size;
340 bits_min := bits;
341 end if;
342 end loop;
343 Slen (si) := max_follo (bits_min) + 1;
344 for sj in 0 .. max_follo (bits_min) loop
345 has_follower (si, Followers (si, sj)) := True;
346 end loop;
347 end loop;
348 if trace then
349 Show_partial_markov (True);
350 end if;
351 end Build_Followers;
352
353 --------------------------------
354 -- LZ77 front-end compression --
355 --------------------------------
356
357 -- Cache for LZ-compressed data, to speedup the 2nd phase:
358
359 LZ_cache_size : constant := 2**18; -- 256KB
360 type LZ_buffer_range is mod LZ_cache_size;
361 type LZ_buffer is array (LZ_buffer_range) of Byte; -- circular buffer
362
363 type LZ_cache_type is record
364 buf : LZ_buffer; -- buf's index arithmetic is mod LZ_cache_size
365 nxt : LZ_buffer_range := 0; -- position of next byte to be written
366 cnt : Natural := 0; -- [0..size]: count of cached bytes
367 end record;
368
369 LZ_cache : LZ_cache_type;
370 lz77_pos, lz77_size : Zip_64_Data_Size_Type := 0;
371
372 -- Possible ranges for LZ distance and length encoding
373 -- in the Zip-Reduce format:
374
375 subtype Length_range is
376 Integer range 4 .. 2**(8 - reduction_factor) + 257;
377
378 subtype Distance_range is
379 Integer range 1 .. (2**reduction_factor) * 256;
380
381 -- max length max dist
382 -- 1 385 512
383 -- 2 321 1024
384 -- 3 289 2048
385 -- 4 273 4096
386
387 type Phase_type is (compute_stats, compress_for_real);
388
389 generic
390 phase : Phase_type;
391 procedure Encode_with_Reduce;
392
393 procedure Encode_with_Reduce is
394 using_LZ77 : Boolean;
395 Derail_LZ77 : exception;
396 feedback_milestone,
397 Bytes_in : Zip_Streams.ZS_Size_Type := 0; -- Count of input file bytes processed
398 user_aborting : Boolean;
399 real_pct : constant array (Phase_type) of Integer := (0, 50);
400 PctDone : Natural;
401
402 function Read_byte return Byte is
403 b : Byte;
404 begin
405 b := IO_buffers.InBuf (IO_buffers.InBufIdx);
406 IO_buffers.InBufIdx := IO_buffers.InBufIdx + 1;
407 if phase = compute_stats then
408 Zip.CRC_Crypto.Update (CRC, (1 => b));
409 end if;
410 Bytes_in := Bytes_in + 1;
411 if feedback /= null then
412 if Bytes_in = 1 then
413 feedback (real_pct (phase), False, user_aborting);
414 end if;
415 if feedback_milestone > 0 and then
416 ((Bytes_in - 1) mod feedback_milestone = 0
417 or Bytes_in = ZS_Size_Type (input_size))
418 then
419 if input_size_known then
420 PctDone := real_pct (phase) + Integer ((50.0 * Float (Bytes_in)) / Float (input_size));
421 feedback (PctDone, False, user_aborting);
422 else
423 feedback (real_pct (phase), False, user_aborting);
424 end if;
425 if user_aborting then
426 raise User_abort;
427 end if;
428 end if;
429 end if;
430 return b;
431 end Read_byte;
432
433 function More_bytes return Boolean is
434 begin
435 if IO_buffers.InBufIdx > IO_buffers.MaxInBufIdx then
436 Read_Block (IO_buffers, input);
437 end if;
438 return not IO_buffers.InputEoF;
439 end More_bytes;
440
441 upper_shift : constant Integer := 2**(8 - reduction_factor);
442 maximum_len_1 : constant Integer := upper_shift - 1;
443 maximum_len_1_b : constant Byte := Byte (maximum_len_1);
444
445 -- LZ77 params
446 Look_redfac : constant array (1 .. 4) of Integer := (31, 63, 255, 191);
447 -- See za_work.xls, sheet Reduce, for the cooking of these numbers...
448 Look_Ahead : constant Integer := Look_redfac (reduction_factor);
449 String_buffer_size : constant := 2**12; -- 2**n optimizes "mod" to "and"
450 Threshold : constant := 3;
451
452 -- If the DLE coding doesn't fit the format constraints, we
453 -- need to decode it as a simple sequence of literals
454 -- before the probabilistic reduction.
455
456 type Text_Buffer is array (0 .. String_buffer_size + Look_Ahead - 1) of Byte;
457 Text_Buf : Text_Buffer;
458 R : Natural;
459
460 last_b : Symbol_range := 0;
461
462 -- Raw byte: post LZ77 / DLE coding, pre probabilistic reduction
463 procedure Write_raw_byte (b : Byte) is
464 curr_b : constant Symbol_range := Symbol_range (b);
465 follo : Boolean;
466 begin
467 lz77_pos := lz77_pos + 1;
468 case phase is
469 --
470 when compute_stats =>
471 markov_d (last_b, curr_b) := markov_d (last_b, curr_b) + 1;
472 -- We also feed the cache which will be read at the 2nd phase:
473 LZ_cache.buf (LZ_cache.nxt) := b;
474 LZ_cache.nxt := LZ_cache.nxt + 1;
475 LZ_cache.cnt := Natural'Min (LZ_cache_size, LZ_cache.cnt + 1);
476 when compress_for_real => -- Probabilistic reduction
477 if Slen (last_b) = 0 then
478 -- Follower set is empty for this character.
479 Put_code (b, 8);
480 else
481 follo := has_follower (last_b, curr_b);
482 Put_code (1 - Boolean'Pos (follo), 1);
483 -- ^ Certainly a weakness of this format is that each byte is preceded by
484 -- a flag signaling "clear text" or compressed.
485 if follo then
486 Put_code (Byte (follower_pos (last_b, curr_b)), B_Table (Slen (last_b)));
487 else
488 Put_code (b, 8);
489 end if;
490 end if;
491 end case;
492 last_b := curr_b;
493 if phase = compress_for_real and then
494 using_LZ77 and then
495 (lz77_size - lz77_pos) < Zip_64_Data_Size_Type (LZ_cache.cnt)
496 -- We have entered the zone covered by the cache, so no need
497 -- to continue the LZ77 compression effort: the results are
498 -- already stored.
499 then
500 raise Derail_LZ77;
501 -- We interrupt the LZ77 compression: data has been already
502 -- cached upon first pass (phase = stats), no need to redo it.
503 end if;
504 end Write_raw_byte;
505
506 -- The following procedures, Write_normal_byte and Write_DL_code,
507 -- are called by the LZ77 compressor
508
509 -- Write a normal, "clear-text", character
510 procedure Write_normal_byte (b : Byte) is
511 begin
512 Write_raw_byte (b);
513 if b = DLE_code then
514 -- disambiguate situation where the character happens to have
515 -- the same 'Pos as the DLE code
516 Write_raw_byte (0);
517 end if;
518 Text_Buf (R) := b;
519 R := (R + 1) mod String_buffer_size;
520 end Write_normal_byte;
521
522 -- Write a Distance-Length code
523 procedure Write_DL_code (distance, length : Integer) is
524 Copy_start : constant Natural := (R - distance) mod String_buffer_size;
525 len : constant Integer := length - 3;
526 dis : constant Integer := distance - 1;
527 dis_upper : Byte;
528 begin
529 if distance in Distance_range and length in Length_range then
530 Write_raw_byte (DLE_code);
531 dis_upper := Byte ((dis / 256) * upper_shift);
532 -- Encode length and upper part of distance
533 if len < maximum_len_1 then
534 Write_raw_byte (Byte (len) + dis_upper);
535 else
536 Write_raw_byte (maximum_len_1_b + dis_upper);
537 Write_raw_byte (Byte (len - maximum_len_1));
538 end if;
539 -- Encode distance
540 Write_raw_byte (Byte (dis mod 256));
541 -- Expand in the circular text buffer to have it up to date
542 for K in 0 .. length - 1 loop
543 Text_Buf (R) := Text_Buf ((Copy_start + K) mod String_buffer_size);
544 R := (R + 1) mod String_buffer_size;
545 end loop;
546 else
547 -- Cannot encode this distance-length pair, then expand to output :-(
548 -- if phase= compress then Put("Aie! (" & distance'img & length'img & ")"); end if;
549 for K in 0 .. length - 1 loop
550 Write_normal_byte (Text_Buf ((Copy_start + K) mod String_buffer_size));
551 end loop;
552 end if;
553 end Write_DL_code;
554
555 procedure Dummy_Estimate_DL_Codes (
556 matches : in out LZ77.Matches_Array;
557 old_match_index : in Natural;
558 prefixes : in LZ77.Byte_Array;
559 best_score_index : out Positive;
560 best_score_set : out LZ77.Prefetch_Index_Type;
561 match_trace : out LZ77.DLP_Array
562 )
563 is null;
564
565 procedure My_LZ77 is
566 new LZ77.Encode
567 (String_buffer_size => String_buffer_size,
568 Look_Ahead => Look_Ahead,
569 Threshold => Threshold,
570 Method => LZ77.LZHuf,
571 -- NB: Method IZ_9 needs exactly the same set of LZ77 parameters as in
572 -- Deflate. Then the compression is worse, though much faster.
573 Read_Byte => Read_byte,
574 More_Bytes => More_bytes,
575 Write_Literal => Write_normal_byte,
576 Write_DL_Code => Write_DL_code,
577 Estimate_DL_Codes => Dummy_Estimate_DL_Codes);
578
579 procedure Finish_Cache is
580 i : LZ_buffer_range := LZ_buffer_range (lz77_pos mod LZ_cache_size);
581 begin
582 while lz77_pos < lz77_size loop
583 Write_raw_byte (LZ_cache.buf (i));
584 i := i + 1;
585 end loop;
586 end Finish_Cache;
587
588 begin -- Encode_with_Reduce
589 Read_Block (IO_buffers, input);
590 R := String_buffer_size - Look_Ahead;
591 if input_size_known then
592 feedback_milestone := ZS_Size_Type (input_size / feedback_steps);
593 end if;
594 using_LZ77 := True;
595 My_LZ77;
596 exception
597 when Derail_LZ77 => -- LZ77 compression interrupted because compressed data already cached
598 using_LZ77 := False;
599 Finish_Cache;
600 if feedback /= null then
601 feedback (100, False, user_aborting);
602 end if;
603 end Encode_with_Reduce;
604
605 procedure Build_stats is new Encode_with_Reduce (phase => compute_stats);
606 procedure Compress_data is new Encode_with_Reduce (phase => compress_for_real);
607
608 mem : ZS_Index_Type;
609
610 begin
611 Allocate_Buffers (IO_buffers, input_size_known, input_size);
612 output_size := 0;
613 mem := Index (input);
614 -- Pass 1: statistics to calibrate the probabilistic expansion
615 Build_stats;
616 Set_Index (input, mem); -- go back to beginning of message to compress
617 Build_Followers;
618 -- Pass 2: actual compression
619 Save_byte := 0; -- Initialize output bit buffer
620 Bits_used := 0;
621 Save_Followers; -- Emit the compression structure before the compressed message
622 lz77_size := lz77_pos;
623 lz77_pos := 0;
624 begin
625 Compress_data; -- Emit the compressed message
626 Flush_output;
627 compression_ok := True;
628 exception
629 when Compression_inefficient =>
630 compression_ok := False;
631 end;
632 Deallocate_Buffers (IO_buffers);
633 exception
634 when others =>
635 Deallocate_Buffers (IO_buffers);
636 raise;
637 end Zip.Compress.Reduce;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.