Source file : bzip2-encoding.adb
1 -- BZip2.Encoding - a standalone, generic BZip2 encoder.
2 ------------------
3 --
4 -- Examples of use:
5 -- BZip2_Enc, a standalone encoder to .bz2 files
6 -- Zip.Compress.BZip2_E, creates Zip files entries with BZip2 encoding
7
8 -- Legal licensing note:
9
10 -- Copyright (c) 2024 .. 2025 Gautier de Montmollin
11 -- SWITZERLAND
12
13 -- Permission is hereby granted, free of charge, to any person obtaining a copy
14 -- of this software and associated documentation files (the "Software"), to deal
15 -- in the Software without restriction, including without limitation the rights
16 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 -- copies of the Software, and to permit persons to whom the Software is
18 -- furnished to do so, subject to the following conditions:
19
20 -- The above copyright notice and this permission notice shall be included in
21 -- all copies or substantial portions of the Software.
22
23 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 -- THE SOFTWARE.
30
31 -- NB: this is the MIT License, as found on the site
32 -- http://www.opensource.org/licenses/mit-license.php
33
34 -----------------
35
36 -- To do:
37 --
38 -- - Performance: use Suffix-Array-Induced-Sorting for the BWT.
39 -- See https://github.com/dsnet/compress/blob/master/bzip2/bwt.go
40 -- https://sites.google.com/site/yuta256/sais
41 -- - Segmentation: brute-force recursive binary segmentation as in EncodeBlock2 in
42 -- 7-Zip's BZip2Encoder.cpp .
43 -- - Use tasking to parallelize the block compression jobs.
44 --
45 -- Already tried without significant success:
46 --
47 -- - Use the permutation of entropy coders that minimizes the
48 -- size of compression structure.
49 -- - Brute-force over different strategies to tweak frequencies for avoiding
50 -- zero occurrences (see Avoid_Zeros). Unfortunately, the gains are offset by larger
51 -- compression structures (the Huffman trees descriptors take more room).
52 -- - Use k-means machine learning method to re-allocate clusters to entropy coders.
53 -- Removed from code on 2025-02-23.
54 -- - Use a "noisiness" (instead of a "bumpiness") function of a group's frequencies
55 -- in the sorting key for the initial clustering.
56 -- - Set up the initial clustering by slicing the global frequency histogram
57 -- "horizontally" (on the symbol axis) to create artificial truncated histograms
58 -- and allocate them to the data groups. It obviously traps the model into
59 -- a suboptimal local optimum in the 258-dimensional criterion space.
60 -- This method is used by the original BZip2 program.
61 -- Removed from code on 2025-02-08.
62
63 with Data_Segmentation;
64 with Huffman.Encoding.Length_Limited_Coding;
65
66 with Ada.Containers.Generic_Constrained_Array_Sort,
67 Ada.Strings.Unbounded,
68 Ada.Text_IO,
69 Ada.Unchecked_Deallocation;
70
71 package body BZip2.Encoding is
72
73 procedure Encode
74 (option : Compression_Option := block_900k;
75 size_hint : Stream_Size_Type := unknown_size)
76 is
77 use Interfaces;
78
79 subtype Bit_Pos_Type is Natural range 0 .. 7;
80 bit_buffer : Byte := 0;
81 bit_pos : Bit_Pos_Type := 7;
82
83 procedure Flush_Bit_Buffer is
84 begin
85 Write_Byte (bit_buffer);
86 bit_buffer := 0;
87 bit_pos := 7;
88 end Flush_Bit_Buffer;
89
90 procedure Put_Bits (data : Unsigned_32; amount : Positive) is
91 begin
92 for count in reverse 1 .. amount loop
93 if (data and Shift_Left (Unsigned_32'(1), count - 1)) /= 0 then
94 bit_buffer := bit_buffer or Shift_Left (Unsigned_8'(1), bit_pos);
95 end if;
96 if bit_pos = 0 then
97 Flush_Bit_Buffer;
98 else
99 bit_pos := bit_pos - 1;
100 end if;
101 end loop;
102 end Put_Bits;
103
104 procedure Put_Bits (b : Boolean) is
105 begin
106 Put_Bits (Boolean'Pos (b), 1);
107 end Put_Bits;
108
109 procedure Put_Bits (s : String) is
110 begin
111 for c of s loop
112 Put_Bits (Character'Pos (c), 8);
113 end loop;
114 end Put_Bits;
115
116 level : constant Natural_32 :=
117 (case option is
118 when block_100k => 1,
119 when block_400k => 4,
120 when block_900k => 9);
121
122 block_capacity : constant Natural_32 := sub_block_size * level;
123
124 -- We use in this package 4 large heap-allocated arrays.
125 -- It is possible to use Ada.Containers.Vectors but the run time
126 -- is longer, possibly due to indirect access to data and various
127 -- checks. For instance, replacing Buffer below with a Vector makes
128 -- the encoding ~26% slower.
129
130 type Buffer is array (Natural_32 range <>) of Byte;
131 type Buffer_Access is access Buffer;
132
133 procedure Unchecked_Free is new Ada.Unchecked_Deallocation (Buffer, Buffer_Access);
134
135 combined_crc : Unsigned_32 := 0;
136
137 block_counter : Natural := 0;
138
139 quiet : constant := 0;
140 headlines : constant := 1;
141 detailed : constant := 2;
142 super_detailed : constant := 3; -- Details down to symbols.
143
144 verbosity_level : constant := quiet;
145
146 procedure Trace (msg : String; verbosity : Natural) with Inline is
147 begin
148 if verbosity_level >= verbosity then
149 Ada.Text_IO.Put_Line ("BZip2: " & msg);
150 end if;
151 end Trace;
152
153 procedure Trace (prefix : String; b : Buffer; verbosity : Natural) with Inline is
154 begin
155 if verbosity_level >= verbosity then
156 declare
157 use Ada.Strings.Unbounded;
158 msg : Unbounded_String;
159 begin
160 for bt of b loop
161 if bt in 32 .. 126 then
162 msg := msg & Character'Val (bt);
163 else
164 msg := msg & '(' & bt'Image & ')';
165 end if;
166 end loop;
167 Trace (prefix & To_String (msg), verbosity);
168 end;
169 end if;
170 end Trace;
171
172 -- Each block is limited either by the data available
173 -- by the block capacity.
174 -- It means that each encoding step has an end and
175 -- that we can theoretically go on with the next step,
176 -- perhaps at the price of using more memory.
177
178 procedure Encode_Block (raw_buf : Buffer) is
179
180 -----------------------------------
181 -- Initial Run-Length Encoding --
182 -----------------------------------
183
184 rle_1_block_size : Natural_32 := 0;
185 block_crc : Unsigned_32;
186 in_use : array (Byte) of Boolean := (others => False);
187
188 rle_1_data : Buffer_Access := new Buffer (1 .. block_capacity * 5 / 4);
189 -- Worst case: all data consist of runs of 4 bytes -> 5 bytes with RLE_1.
190
191 procedure RLE_1 is
192 b_prev : Byte := 0; -- Initialization is to reassure the compiler.
193 run : Natural := 0;
194
195 procedure Store_Run with Inline is
196 procedure Store (x : Byte) with Inline is
197 begin
198 rle_1_block_size := rle_1_block_size + 1;
199 rle_1_data (rle_1_block_size) := x;
200 in_use (x) := True;
201 end Store;
202 begin
203 for count in 1 .. Integer'Min (4, run) loop
204 Store (b_prev);
205 end loop;
206 if run >= 4 then
207 pragma Assert (run <= 259);
208 Store (Byte (run - 4));
209 end if;
210 run := 1;
211 end Store_Run;
212
213 start : Boolean := True;
214 begin
215 CRC.Init (block_crc);
216 for b of raw_buf loop
217 CRC.Update (block_crc, b);
218 if start or else b /= b_prev then
219 -- Startup or Run break:
220 Store_Run;
221 start := False;
222 elsif run = 259 then
223 -- Force a run break, even though b = b_prev:
224 Store_Run;
225 else
226 run := run + 1;
227 end if;
228 b_prev := b;
229 end loop;
230 Store_Run;
231 Trace ("RLE_1: raw buffer length: " & raw_buf'Length'Image, headlines);
232 Trace ("RLE_1-processed block size:" & rle_1_block_size'Image, headlines);
233 if verbosity_level >= super_detailed then
234 Trace ("RLE_1: ", rle_1_data (1 .. rle_1_block_size), super_detailed);
235 end if;
236 end RLE_1;
237
238 ---------------------------------
239 -- Burrows-Wheeler Transform --
240 ---------------------------------
241
242 bwt_data : Buffer_Access;
243 bwt_index : Natural_32 := 0; -- 0-based.
244
245 procedure BWT is
246
247 subtype Offset_Range is Integer_32 range 0 .. rle_1_block_size - 1;
248
249 type Offset_Table is array (Offset_Range) of Offset_Range;
250 type Offset_Table_Access is access Offset_Table;
251
252 procedure Unchecked_Free is new Ada.Unchecked_Deallocation (Offset_Table, Offset_Table_Access);
253
254 -- Compare the message, rotated with two different offsets.
255 function Lexicographically_Smaller (left, right : Offset_Range) return Boolean with Inline is
256 il, ir : Integer_32;
257 l, r : Byte;
258 begin
259 pragma Assert (rle_1_data'First = 1);
260 il := 1 + (if left = 0 then 0 else rle_1_block_size - left);
261 ir := 1 + (if right = 0 then 0 else rle_1_block_size - right);
262 for i in Offset_Range loop
263 l := rle_1_data (il);
264 r := rle_1_data (ir);
265 if l < r then
266 return True;
267 elsif l > r then
268 return False;
269 end if;
270 il := il + 1;
271 if il > rle_1_block_size then
272 il := 1;
273 end if;
274 ir := ir + 1;
275 if ir > rle_1_block_size then
276 ir := 1;
277 end if;
278 end loop;
279 -- Equality in contents.
280 return left < right; -- Ensures stable sorting.
281 end Lexicographically_Smaller;
282
283 procedure Offset_Sort is new Ada.Containers.Generic_Constrained_Array_Sort
284 (Index_Type => Offset_Range,
285 Element_Type => Offset_Range,
286 Array_Type => Offset_Table,
287 "<" => Lexicographically_Smaller);
288
289 offset : Offset_Table_Access := new Offset_Table;
290
291 begin
292 for i in Offset_Range loop
293 offset (i) := i;
294 end loop;
295
296 Offset_Sort (offset.all); -- <--- The BW Transform is done here.
297
298 bwt_data := new Buffer (1 .. rle_1_block_size);
299 for i in Offset_Range loop
300 -- Copy last column of the matrix into transformed message:
301 bwt_data (1 + i) := rle_1_data (1 + (rle_1_block_size - 1 - offset (i)) mod rle_1_block_size);
302 if offset (i) = 0 then
303 -- Found the row index of the original message.
304 bwt_index := i;
305 end if;
306 end loop;
307
308 if verbosity_level >= super_detailed then
309 if rle_1_block_size = 0 then
310 Trace ("BWT: (empty block)", super_detailed);
311 else
312 Trace ("BWT: ", bwt_data.all, super_detailed);
313 Trace ("BWT index:" & bwt_index'Image, super_detailed);
314 end if;
315 end if;
316 Unchecked_Free (offset);
317 Unchecked_Free (rle_1_data);
318 exception
319 when others =>
320 Unchecked_Free (offset);
321 raise;
322 end BWT;
323
324 ----------------------------------------------------
325 -- Move-to-Front and second Run-Length Encoding --
326 ----------------------------------------------------
327
328 subtype Max_Alphabet is Integer range 0 .. max_alphabet_size - 1;
329
330 type MTF_Array is array (Positive_32 range <>) of Max_Alphabet;
331 type MTF_Array_Access is access MTF_Array;
332
333 procedure Unchecked_Free is new Ada.Unchecked_Deallocation (MTF_Array, MTF_Array_Access);
334
335 mtf_data : MTF_Array_Access;
336 mtf_last : Natural_32 := 0;
337
338 unseq_to_seq : array (Byte) of Byte;
339
340 normal_symbols_in_use : Natural;
341 last_symbol_in_use : Natural;
342 EOB : Natural;
343
344 procedure MTF_and_RLE_2 is
345
346 procedure Prepare_Mapping is
347 begin
348 normal_symbols_in_use := 0;
349 for i in Byte loop
350 if in_use (i) then
351 unseq_to_seq (i) := Byte (normal_symbols_in_use);
352 normal_symbols_in_use := normal_symbols_in_use + 1;
353 end if;
354 end loop;
355
356 last_symbol_in_use := normal_symbols_in_use + 3 - 1 - 1;
357 -- ^ + 3 : the special symbols RUN_A, RUN_B, EOB
358 -- - 1 : value 0 has no symbol (RUN_A and RUN_B are used for the runs of 0)
359 -- - 1 : zero-based
360
361 EOB := last_symbol_in_use;
362
363 Trace ("Normal symbols in use:" & normal_symbols_in_use'Image, detailed);
364 end Prepare_Mapping;
365
366 procedure Store (a : Max_Alphabet) is
367 begin
368 mtf_last := mtf_last + 1;
369 mtf_data (mtf_last) := a;
370 end Store;
371
372 run : Natural_32 := 0;
373
374 procedure Store_Run with Inline is
375 rc : Unsigned_32;
376 begin
377 if run > 0 then
378 -- Output a binary representation of `run`
379 -- using RUN_A for 0's RUN_B for 1's.
380 rc := Unsigned_32 (run + 1);
381 loop
382 Store (Max_Alphabet (rc and 1));
383 rc := Shift_Right (rc, 1);
384 exit when rc < 2;
385 end loop;
386 -- Reset the run count.
387 run := 0;
388 end if;
389 end Store_Run;
390
391 mtf_symbol : array (0 .. 255) of Byte;
392 idx : Natural;
393 bt_seq : Byte;
394
395 begin
396 Prepare_Mapping;
397
398 mtf_data := new MTF_Array (1 .. 1 + 2 * rle_1_block_size);
399
400 for i in mtf_symbol'Range loop
401 mtf_symbol (i) := Byte (i);
402 end loop;
403
404 Big_MTF_RLE_2_Loop :
405 for bt of bwt_data.all loop
406 bt_seq := unseq_to_seq (bt);
407
408 -- MTF part:
409
410 Search_Value :
411 for search in mtf_symbol'Range loop
412 if mtf_symbol (search) = bt_seq then
413 idx := search;
414 exit Search_Value;
415 end if;
416 end loop Search_Value;
417
418 Move_Value_to_Front :
419 for i in reverse 1 .. idx loop
420 mtf_symbol (i) := mtf_symbol (i - 1);
421 end loop Move_Value_to_Front;
422 mtf_symbol (0) := bt_seq;
423
424 -- RLE part:
425
426 if idx = 0 then
427 run := run + 1;
428 else
429 Store_Run;
430 Store (1 + idx); -- Value stored is >= 2. Values 0 and 1 are RUN_A, RUN_B.
431 end if;
432
433 end loop Big_MTF_RLE_2_Loop;
434
435 Store_Run;
436 Store (EOB);
437
438 Unchecked_Free (bwt_data);
439 end MTF_and_RLE_2;
440
441 ----------------------
442 -- Entropy Coding --
443 ----------------------
444
445 subtype Entropy_Coder_Range is Integer range 1 .. max_entropy_coders;
446
447 descr :
448 array (Entropy_Coder_Range) of
449 Huffman.Encoding.Descriptor (Max_Alphabet);
450
451 entropy_coder_count : Entropy_Coder_Range;
452 selector_count : Integer_32;
453
454 selector : array (1 .. 1 + block_capacity / group_size) of Entropy_Coder_Range;
455
456 procedure Entropy_Calculations is
457
458 subtype Alphabet_in_Use is Integer range 0 .. last_symbol_in_use;
459
460 type Count_Array is array (Alphabet_in_Use) of Natural_32;
461
462 procedure Avoid_Zeros (freq : in out Count_Array) is
463 zeroes : Natural := 0;
464 begin
465 for stat of freq loop
466 if stat = 0 then
467 zeroes := zeroes + 1;
468 end if;
469 end loop;
470 case zeroes is
471 when 0 =>
472 -- Zero zeroes, zero problem :-)
473 null;
474 when 1 .. 100 =>
475 -- Turn the "0"'s into "1"'s.
476 for stat of freq loop
477 stat := Natural_32'Max (1, stat);
478 end loop;
479 when others =>
480 -- Turn the "0"'s into an actual "1/2".
481 for stat of freq loop
482 stat := (if stat = 0 then 1 else stat * 2);
483 end loop;
484 end case;
485 end Avoid_Zeros;
486
487 max_code_len : Positive;
488
489 procedure Output_Frequency_Matrix is
490 use Ada.Text_IO;
491 f : File_Type;
492 file_name : String := "freq" & block_counter'Image & ".csv";
493 freq : Count_Array := (others => 0);
494 symbol : Alphabet_in_Use;
495 sep : constant Character := ';';
496 begin
497 -- In this file, rows represent groups of data,
498 -- columns represent the frequencies of each symbol.
499 file_name (file_name'First + 4) := '_';
500 Create (f, Out_File, file_name);
501 for mtf_idx in 1 .. mtf_last loop
502 symbol := mtf_data (mtf_idx);
503 freq (symbol) := freq (symbol) + 1;
504 if mtf_idx rem group_size = 0 or else mtf_idx = mtf_last then
505 -- Dump group's statistics:
506 for s in Alphabet_in_Use loop
507 Put (f, freq (s)'Image & sep);
508 end loop;
509 New_Line (f);
510 freq := (others => 0);
511 end if;
512 end loop;
513 Close (f);
514 end Output_Frequency_Matrix;
515
516 type Huffman_Length_Array is array (Alphabet_in_Use) of Natural;
517
518 procedure Define_Descriptor (freq : in out Count_Array; des : Entropy_Coder_Range) is
519 procedure LLHCL is new
520 Huffman.Encoding.Length_Limited_Coding
521 (Alphabet => Alphabet_in_Use,
522 Count_Type => Natural_32,
523 Count_Array => Count_Array,
524 Length_Array => Huffman_Length_Array,
525 max_bits => max_code_len);
526 len : Huffman_Length_Array;
527 pragma Assert (max_code_len <= max_code_len_bzip2_1_0_2);
528 begin
529 Avoid_Zeros (freq);
530 LLHCL (freq, len);
531 for symbol in Alphabet_in_Use loop
532 descr (des)(symbol).bit_length := len (symbol);
533 end loop;
534 Huffman.Encoding.Prepare_Codes
535 (descr (des)(Alphabet_in_Use), max_code_len, False);
536 end Define_Descriptor;
537
538 -----------------------------------------------------
539 -- Radically simple but functional entropy coder --
540 -----------------------------------------------------
541
542 procedure Single_Entropy_Coder is
543 freq : Count_Array := (others => 0);
544 begin
545 for symbol of mtf_data (1 .. mtf_last) loop
546 freq (symbol) := freq (symbol) + 1;
547 end loop;
548 max_code_len := max_code_len_bzip2_1_0_3;
549 Define_Descriptor (freq, 1);
550 entropy_coder_count := 2; -- The canonical BZip2 decoder requires >= 2 coders.
551 descr (2) := descr (1); -- We actually don't use the copy (psssht), but need to define it!
552 for i in 1 .. selector_count loop
553 selector (Integer_32 (i)) := 1;
554 end loop;
555 end Single_Entropy_Coder;
556
557 --------------------------------------------------------------------------
558 -- Define multiple entropy coders (max 6) and assign them to the --
559 -- various groups of data (max 18000, with 50 symbols in each group). --
560 -- The art is to gather the groups into meaningful clusters. --
561 -- Each cluster will use one of the entropy coders. --
562 --------------------------------------------------------------------------
563
564 procedure Multiple_Entropy_Coders is
565
566 subtype Selector_Range is Positive_32 range 1 .. selector_count;
567
568 -- Create an initial clustering depending on a ranking using a key
569 -- computed on a subset of the alphabet (a mix of RUN_A, RUN_B,
570 -- low-index MTF values).
571 -- Look at the first few columns of some output
572 -- of Output_Frequency_Matrix to find why.
573 --
574 procedure Initial_Clustering_Ranking_Method (sample_width : Positive) is
575
576 type Pair is record
577 key : Natural_32; -- Occurrences of symbols that are in the subset.
578 index : Positive_32; -- Group number.
579 end record;
580
581 type Ranking_Array is array (Selector_Range) of Pair;
582
583 ranking : Ranking_Array;
584
585 function Smaller_Key (left, right : Pair) return Boolean is (left.key < right.key);
586
587 procedure Ranking_Sort is new Ada.Containers.Generic_Constrained_Array_Sort
588 (Index_Type => Selector_Range,
589 Element_Type => Pair,
590 Array_Type => Ranking_Array,
591 "<" => Smaller_Key);
592
593 type Cluster_Attribution is array (Positive range <>) of Entropy_Coder_Range;
594
595 procedure Initial_Clustering_by_Rank (attr : Cluster_Attribution) is
596 na : constant Positive_32 := attr'Length;
597 ns : constant Selector_Range := selector_count;
598 a32 : Positive_32;
599 begin
600 for attr_idx in attr'Range loop
601 a32 := Integer_32 (attr_idx) - Integer_32 (attr'First) + 1; -- a32 = 1, 2, 3, .. na.
602 for i in 1 + (a32 - 1) * ns / na .. a32 * ns / na loop
603 selector (ranking (i).index) := attr (attr_idx);
604 end loop;
605 end loop;
606 end Initial_Clustering_by_Rank;
607
608 pos_countdown : Natural := group_size;
609 sel_idx : Positive_32 := 1;
610 key : Natural_32 := 0;
611 last_symbol_sampled : constant Natural := Integer'Min (EOB - 1, run_a + sample_width - 1);
612 symbol : Alphabet_in_Use;
613 begin
614 -- Populate the frequency stats for the ranking, grouped by data group.
615 --
616 for mtf_idx in 1 .. mtf_last loop
617 symbol := mtf_data (mtf_idx);
618 if symbol in run_a .. last_symbol_sampled then
619 key := key + 1;
620 end if;
621 pos_countdown := pos_countdown - 1;
622 if pos_countdown = 0 then
623 ranking (sel_idx) := (key => key, index => sel_idx);
624 pos_countdown := group_size;
625 sel_idx := sel_idx + 1;
626 key := 0;
627 end if;
628 end loop;
629 if pos_countdown < group_size then
630 -- Finish last, incomplete group.
631 ranking (sel_idx) := (key => key, index => sel_idx);
632 end if;
633
634 -- The construction of initial clusters can now be
635 -- done easily using the following sorting:
636 --
637 Ranking_Sort (ranking);
638
639 -- Example with two clusters:
640 -- - Low values (more random data) for the cluster #2.
641 -- - High values (more redundant data) for #1.
642 --
643 case entropy_coder_count is
644 when 1 => null; -- Not supported by canonical BZip2.
645 when 2 => Initial_Clustering_by_Rank ((2, 1));
646 when 3 => Initial_Clustering_by_Rank ((3, 1, 2));
647 when 4 => Initial_Clustering_by_Rank ((4, 2, 1, 3));
648 when 5 => Initial_Clustering_by_Rank ((5, 3, 1, 2, 4));
649 when 6 => Initial_Clustering_by_Rank ((6, 4, 2, 1, 3, 5));
650 end case;
651
652 end Initial_Clustering_Ranking_Method;
653
654 procedure Define_Descriptors is
655 pos_countdown : Natural := group_size;
656 selector_idx : Positive_32 := 1;
657 cluster : Entropy_Coder_Range := selector (1);
658 symbol : Alphabet_in_Use;
659 freq_cluster : array (1 .. entropy_coder_count) of Count_Array := (others => (others => 0));
660 begin
661 -- Populate the frequency stats, grouped by cluster (= entropy coder choice):
662 for mtf_idx in 1 .. mtf_last loop
663 symbol := mtf_data (mtf_idx);
664 freq_cluster (cluster)(symbol) := freq_cluster (cluster)(symbol) + 1;
665 pos_countdown := pos_countdown - 1;
666 if pos_countdown = 0 and then mtf_idx < mtf_last then
667 pos_countdown := group_size;
668 selector_idx := selector_idx + 1;
669 cluster := selector (selector_idx);
670 end if;
671 end loop;
672 -- Create Huffman codes based on the said frequencies:
673 for cl in 1 .. entropy_coder_count loop
674 Define_Descriptor (freq_cluster (cl), cl);
675 end loop;
676 end Define_Descriptors;
677
678 defector_groups : Natural;
679
680 procedure Simulate_Entropy_Coding_Variants_and_Reclassify is
681 pos_countdown : Natural := group_size;
682 selector_idx : Positive_32 := 1;
683 cluster : Entropy_Coder_Range := selector (1);
684 symbol : Alphabet_in_Use;
685 bit_count : array (1 .. entropy_coder_count) of Natural := (others => 0);
686
687 -- We simulate the encoding of selectors (for its cost).
688 mtf_cluster_value : array (1 .. entropy_coder_count) of Positive;
689 mtf_cluster_idx : Positive;
690
691 procedure Optimize_Group is
692 min_bits : Natural := Natural'Last;
693 best : Entropy_Coder_Range := cluster;
694 cost : Natural;
695 begin
696 -- At this point we have computed the costs in bits for
697 -- encoding the current group of data using various entropy coders.
698 -- Now we look at the extra cost of switching entropy coders.
699 for cl in 1 .. entropy_coder_count loop
700 cost := bit_count (cl);
701 -- Here we account the mtf encoding of the selectors.
702 for search in mtf_cluster_value'Range loop
703 if mtf_cluster_value (search) = cl then
704 mtf_cluster_idx := search;
705 exit;
706 end if;
707 end loop;
708 cost := cost + mtf_cluster_idx;
709 --
710 if cost < min_bits then
711 -- Encoder #cl is cheaper.
712 min_bits := cost;
713 best := cl;
714 end if;
715 end loop;
716
717 if best /= cluster then
718 -- We have found a cheaper encoding by switching to another cluster.
719 -- -> the group #sel_idx changes party (re-allocation).
720 selector (selector_idx) := best;
721 defector_groups := defector_groups + 1;
722 end if;
723
724 -- Now do the "definitive" (but still simulated)
725 -- mtf for the chosen cluster index.
726 for search in mtf_cluster_value'Range loop
727 if mtf_cluster_value (search) = selector (selector_idx) then
728 mtf_cluster_idx := search;
729 exit;
730 end if;
731 end loop;
732 -- Move the value to the first place.
733 for j in reverse 2 .. mtf_cluster_idx loop
734 mtf_cluster_value (j) := mtf_cluster_value (j - 1);
735 end loop;
736 mtf_cluster_value (1) := selector (selector_idx);
737 end Optimize_Group;
738
739 begin
740 -- Cost analysis by simulation and re-classification
741 -- (or said otherwise, re-allocation).
742 --
743 for w in mtf_cluster_value'Range loop
744 -- We start with 1, 2, 3, ...:
745 mtf_cluster_value (w) := w;
746 end loop;
747
748 defector_groups := 0;
749 pos_countdown := group_size;
750 for mtf_idx in 1 .. mtf_last loop
751 symbol := mtf_data (mtf_idx);
752 for cl in 1 .. entropy_coder_count loop
753 -- For each cluster cl, simulate output assuming
754 -- the current group belongs to cluster cl.
755 bit_count (cl) := bit_count (cl) + descr (cl) (symbol).bit_length;
756 end loop;
757 pos_countdown := pos_countdown - 1;
758 if pos_countdown = 0 then
759 Optimize_Group;
760 pos_countdown := group_size;
761 if mtf_idx < mtf_last then
762 bit_count := (others => 0);
763 selector_idx := selector_idx + 1;
764 cluster := selector (selector_idx);
765 end if;
766 end if;
767 end loop;
768 if pos_countdown < group_size then
769 -- Optimize last, incomplete group.
770 Optimize_Group;
771 end if;
772 end Simulate_Entropy_Coding_Variants_and_Reclassify;
773
774 low_cluster_usage : Boolean := False;
775
776 procedure Cluster_Statistics is
777 stat_cluster : array (Entropy_Coder_Range) of Natural_32 := (others => 0);
778 cl : Entropy_Coder_Range;
779 uniform_usage : constant Natural_32 := selector_count / Natural_32 (entropy_coder_count);
780 threshold_denominator : constant := 2;
781 begin
782 low_cluster_usage := False;
783 -- Compute cluster usage.
784 for i in Selector_Range loop
785 cl := selector (i);
786 stat_cluster (cl) := stat_cluster (cl) + 1;
787 end loop;
788 for c in 1 .. entropy_coder_count loop
789 Trace
790 (" Cluster" & c'Image & " is used by" &
791 stat_cluster (c)'Image & " groups.", detailed);
792 if stat_cluster (c) < uniform_usage / threshold_denominator then
793 low_cluster_usage := True;
794 Trace (" ---> Low Cluster Usage!", detailed);
795 end if;
796 end loop;
797 end Cluster_Statistics;
798
799 procedure Construct (sample_width : Natural) is
800 reclassification_iteration_limit : constant := 10;
801 begin
802 Initial_Clustering_Ranking_Method (sample_width);
803 Trace
804 (" Construct with" & entropy_coder_count'Image & " coders", detailed);
805
806 -- Compute the entropy coders based on the current
807 -- clustering, then refine the (group -> cluster) attribution.
808 -- A group can join another cluster if the number of bits
809 -- in the output is smaller. However, it will influence the
810 -- frequencies of both affected clusters.
811 --
812 for iteration in 1 .. reclassification_iteration_limit loop
813 Cluster_Statistics;
814 Define_Descriptors;
815 Simulate_Entropy_Coding_Variants_and_Reclassify;
816 Trace
817 (" Iteration" & iteration'Image &
818 ". Defector groups:" & defector_groups'Image,
819 detailed);
820 exit when defector_groups = 0;
821 end loop;
822 if defector_groups > 0 then
823 -- The cluster optimization loop has exited before
824 -- full stabilization (clusters have changed).
825 Define_Descriptors;
826 end if;
827 Cluster_Statistics;
828 end Construct;
829
830 function Compute_Total_Entropy_Cost return Natural_32 is
831 -- We simulate the sending of the whole block and
832 -- look at the costs related to the entropy coding.
833
834 function Compute_Selectors_Cost return Natural_32 is
835 value : array (1 .. entropy_coder_count) of Positive;
836 mtf_idx : Positive;
837 bits : Natural_32 := 0;
838 begin
839 for w in value'Range loop
840 value (w) := w;
841 end loop;
842 for i in Selector_Range loop
843 for search in value'Range loop
844 if value (search) = selector (i) then
845 mtf_idx := search;
846 exit;
847 end if;
848 end loop;
849 for j in reverse 2 .. mtf_idx loop
850 value (j) := value (j - 1);
851 end loop;
852 value (1) := selector (i);
853 bits := bits + Integer_32 (mtf_idx);
854 end loop;
855 return bits;
856 end Compute_Selectors_Cost;
857
858 function Compute_Huffman_Bit_Lengths_Cost return Natural_32 is
859 current_bit_length, new_bit_length : Natural;
860 bits : Natural_32 := 0;
861 begin
862 for coder in 1 .. entropy_coder_count loop
863 current_bit_length := descr (coder)(0).bit_length;
864 bits := bits + 5;
865 for i in 0 .. last_symbol_in_use loop
866 new_bit_length := descr (coder)(i).bit_length;
867 Adjust_Bit_length :
868 loop
869 if current_bit_length = new_bit_length then
870 bits := bits + 1;
871 exit Adjust_Bit_length;
872 else
873 bits := bits + 2;
874 if current_bit_length < new_bit_length then
875 current_bit_length := current_bit_length + 1;
876 else
877 current_bit_length := current_bit_length - 1;
878 end if;
879 end if;
880 end loop Adjust_Bit_length;
881 end loop;
882 end loop;
883 return bits;
884 end Compute_Huffman_Bit_Lengths_Cost;
885
886 pos_countdown : Natural := group_size;
887 selector_idx : Positive_32 := 1;
888 cluster : Entropy_Coder_Range := selector (1);
889 bits : Natural_32 := 0;
890 begin
891 -- Simulate the sending of the data itself:
892 for mtf_idx in 1 .. mtf_last loop
893 bits := bits + Natural_32 (descr (cluster)(mtf_data (mtf_idx)).bit_length);
894 pos_countdown := pos_countdown - 1;
895 if pos_countdown = 0 and then mtf_idx < mtf_last then
896 pos_countdown := group_size;
897 selector_idx := selector_idx + 1;
898 cluster := selector (selector_idx);
899 end if;
900 end loop;
901 -- We add to the compressed data cost, the cost of switching coders
902 -- over the whole block and the cost of sending the compression
903 -- structures of the coders.
904 bits := bits + Compute_Selectors_Cost + Compute_Huffman_Bit_Lengths_Cost;
905 return bits;
906 end Compute_Total_Entropy_Cost;
907
908 cost : Natural_32;
909 best_cost : Natural_32 := Natural_32'Last;
910 best_ec_count : Entropy_Coder_Range;
911 best_max_code_len : Positive;
912 best_sample_width : Natural;
913
914 type Value_Array is array (Positive range <>) of Natural;
915
916 -----------------------------------------------------------------
917 -- Choices for brute-force search of the best entropy coding --
918 -----------------------------------------------------------------
919
920 max_code_len_choices : constant Value_Array :=
921 (case option is
922 when block_100k => (1 => 16),
923 when block_400k => (1 => 16),
924 when block_900k => (15, 17));
925
926 coder_choices : constant Value_Array :=
927 (case option is
928 when block_100k => (4, 6),
929 when block_400k => (4, 6),
930 when block_900k =>
931 (case mtf_last is
932 when 1 .. 5_000 => (2, 3, 6),
933 when 5_001 .. 10_000 => (3, 4, 6),
934 when others => (3, 4, 5, 6)));
935
936 sample_width_choices : constant Value_Array :=
937 (case option is
938 when block_100k => (1 => 4),
939 when block_400k => (1 => 4),
940 when block_900k => (3, 4));
941
942 -- In a former version, we had 210 combinations of
943 -- brute-force choices for option block_900k, making
944 -- that option run 13x longer that with only 1 combination!
945 -- See also timings in doc/za_work.xls, sheet BZip2.
946
947 begin
948 -- Brute-force: test some max code lengths:
949 for max_code_len_test of max_code_len_choices loop
950 max_code_len := max_code_len_test;
951 -- Brute-force: test some sample widths:
952 for sample_width_test of sample_width_choices loop
953 -- Brute-force: test some amounts of entropy coders:
954 for ec_test in reverse min_entropy_coders .. max_entropy_coders loop
955 if low_cluster_usage
956 -- ^ At least one cluster of previous iteration is not used much.
957 or else (for some value of coder_choices => value = ec_test)
958 then
959 entropy_coder_count := ec_test;
960 Construct (sample_width_test);
961 cost := Compute_Total_Entropy_Cost;
962 if cost < best_cost then
963 best_cost := cost;
964 best_ec_count := ec_test;
965 best_max_code_len := max_code_len;
966 best_sample_width := sample_width_test;
967 end if;
968 end if;
969 end loop;
970 end loop;
971 end loop;
972
973 max_code_len := best_max_code_len;
974 entropy_coder_count := best_ec_count;
975 Trace
976 ("Max len:" & max_code_len'Image &
977 ", coders:" & entropy_coder_count'Image &
978 ", sample width:" & best_sample_width'Image,
979 detailed);
980 Construct (best_sample_width);
981 end Multiple_Entropy_Coders;
982
983 trace_frequency_matrix : constant Boolean := False;
984 use_single_coder : constant Boolean := False;
985
986 begin
987 selector_count := 1 + (mtf_last - 1) / group_size;
988 if trace_frequency_matrix then
989 Output_Frequency_Matrix;
990 end if;
991
992 if use_single_coder then
993 Single_Entropy_Coder;
994 else
995 Multiple_Entropy_Coders;
996 end if;
997 end Entropy_Calculations;
998
999 ---------------------------------
1000 -- Output of compressed data --
1001 ---------------------------------
1002
1003 procedure Put_Block_Header is
1004 begin
1005 Put_Bits (block_header_magic);
1006 block_crc := CRC.Final (block_crc);
1007 Put_Bits (block_crc, 32);
1008 Trace ("Block CRC: " & block_crc'Image, detailed);
1009 combined_crc := Rotate_Left (combined_crc, 1) xor block_crc;
1010 Trace ("Combined CRC:" & combined_crc'Image, detailed);
1011 Put_Bits (0, 1); -- Randomized flag, always False.
1012 Put_Bits (Unsigned_32 (bwt_index), 24);
1013 end Put_Block_Header;
1014
1015 procedure Put_Block_Trees_Descriptors is
1016
1017 procedure Put_Mapping_Table is
1018 in_use_16 : array (Byte range 0 .. 15) of Boolean := (others => False);
1019 begin
1020 for i in in_use_16'Range loop
1021 for j in in_use_16'Range loop
1022 if in_use (i * 16 + j) then
1023 in_use_16 (i) := True;
1024 end if;
1025 end loop;
1026 end loop;
1027
1028 -- Send the first 16 bits which tell which pieces are stored.
1029 for i in in_use_16'Range loop
1030 Put_Bits (in_use_16 (i));
1031 end loop;
1032 -- Send detail of the used pieces.
1033 for i in in_use_16'Range loop
1034 if in_use_16 (i) then
1035 for j in in_use_16'Range loop
1036 Put_Bits (in_use (i * 16 + j));
1037 end loop;
1038 end if;
1039 end loop;
1040 end Put_Mapping_Table;
1041
1042 procedure Put_Selectors is
1043 value : array (1 .. entropy_coder_count) of Positive;
1044 mtf_idx : Positive;
1045 begin
1046 Put_Bits (Unsigned_32 (selector_count), 15);
1047 for w in value'Range loop
1048 -- We start with 1, 2, 3, ...:
1049 value (w) := w;
1050 end loop;
1051 for i in 1 .. selector_count loop
1052 for search in value'Range loop
1053 if value (search) = selector (i) then
1054 mtf_idx := search;
1055 exit;
1056 end if;
1057 end loop;
1058 -- Move the value to the first place.
1059 for j in reverse 2 .. mtf_idx loop
1060 value (j) := value (j - 1);
1061 end loop;
1062 value (1) := selector (i);
1063 -- MTF-transformed index for the selected entropy coder.
1064 for bar in 1 .. mtf_idx - 1 loop
1065 -- Output as many '1' bit as the value of mtf_idx - 1:
1066 Put_Bits (1, 1);
1067 end loop;
1068 Put_Bits (0, 1);
1069 end loop;
1070 end Put_Selectors;
1071
1072 procedure Put_Huffman_Bit_Lengths is
1073 current_bit_length, new_bit_length : Natural;
1074 begin
1075 for coder in 1 .. entropy_coder_count loop
1076 current_bit_length := descr (coder)(0).bit_length;
1077 Put_Bits (Unsigned_32 (current_bit_length), 5);
1078 for i in 0 .. last_symbol_in_use loop
1079 new_bit_length := descr (coder)(i).bit_length;
1080 Adjust_Bit_length :
1081 loop
1082 if current_bit_length = new_bit_length then
1083 Put_Bits (0, 1);
1084 exit Adjust_Bit_length;
1085 else
1086 Put_Bits (1, 1);
1087 if current_bit_length < new_bit_length then
1088 current_bit_length := current_bit_length + 1;
1089 Put_Bits (0, 1);
1090 else
1091 current_bit_length := current_bit_length - 1;
1092 Put_Bits (1, 1);
1093 end if;
1094 end if;
1095 end loop Adjust_Bit_length;
1096 end loop;
1097 end loop;
1098 end Put_Huffman_Bit_Lengths;
1099
1100 begin
1101 Put_Mapping_Table;
1102 Put_Bits (Unsigned_32 (entropy_coder_count), 3);
1103 Put_Selectors;
1104 Put_Huffman_Bit_Lengths;
1105 end Put_Block_Trees_Descriptors;
1106
1107 procedure Entropy_Output is
1108 pos_countdown : Natural := group_size;
1109 selector_idx : Positive_32 := 1;
1110 cluster : Entropy_Coder_Range := selector (1);
1111 symbol : Max_Alphabet;
1112 begin
1113 for mtf_idx in 1 .. mtf_last loop
1114 symbol := mtf_data (mtf_idx);
1115
1116 Put_Bits
1117 (Unsigned_32
1118 (descr (cluster) (symbol).code),
1119 descr (cluster) (symbol).bit_length);
1120
1121 pos_countdown := pos_countdown - 1;
1122 if pos_countdown = 0 and then mtf_idx < mtf_last then
1123 pos_countdown := group_size;
1124 selector_idx := selector_idx + 1;
1125 cluster := selector (selector_idx);
1126 end if;
1127 end loop;
1128
1129 Unchecked_Free (mtf_data);
1130 end Entropy_Output;
1131
1132 begin
1133 block_counter := block_counter + 1;
1134 Trace ("Block" & block_counter'Image, headlines);
1135
1136 -- Data transformation (no output):
1137 RLE_1;
1138 BWT;
1139 MTF_and_RLE_2;
1140 Entropy_Calculations;
1141
1142 -- Now we output the block's compressed data:
1143 Put_Block_Header;
1144 Put_Block_Trees_Descriptors;
1145 Entropy_Output;
1146
1147 exception
1148 when others =>
1149 Unchecked_Free (rle_1_data);
1150 Unchecked_Free (bwt_data);
1151 Unchecked_Free (mtf_data);
1152 raise;
1153 end Encode_Block;
1154
1155 stream_rest : Stream_Size_Type := size_hint;
1156
1157 --------------------------------------------
1158 -- Data acquisition and block splitting --
1159 --------------------------------------------
1160
1161 procedure Read_and_Split_Block (dyn_block_capacity : Natural_32) is
1162
1163 -- In the cases RLE_1 compression is efficient, the
1164 -- input buffer can contain much more that the post RLE_1 block.
1165 -- Best case: all runs of 259 bytes, factor 259/5 = 51.8.
1166 -- The latter has to fit into the agreed capacity (a multiple of 100_000).
1167 -- So, we define a conveniently large input buffer.
1168
1169 multiplier : constant := 10;
1170
1171 raw_buf : Buffer_Access := new Buffer (1 .. multiplier * dyn_block_capacity);
1172
1173 package Segmentation_for_BZip2 is
1174 new Data_Segmentation
1175 (Index => Natural_32,
1176 Alphabet => Byte,
1177 Buffer_Type => Buffer,
1178 discrepancy_threshold => 2.0,
1179 index_threshold => 80_000,
1180 window_size => 80_000);
1181
1182 single_segment : constant Boolean := False;
1183 seg : Segmentation_for_BZip2.Segmentation;
1184
1185 raw_buf_index : Natural_32 := 0;
1186 index_start : Natural_32 := 1;
1187
1188 -- We have to simulate RLE_1 to avoid block size overflows
1189 -- in the decoder.
1190 -- RLE_1 often expands the data (and sometimes does it
1191 -- considerably) when it meets runs of length 4: 5 bytes are
1192 -- stored in that case.
1193 -- So the worst case expansion is by a factor 5/4.
1194
1195 rle_1_block_size : Natural_32 := 0;
1196 b : Byte;
1197 b_prev : Byte := 0; -- Initialization is to reassure the compiler.
1198 run : Natural := 0;
1199
1200 procedure Simulate_Store_Run with Inline is
1201 begin
1202 rle_1_block_size := rle_1_block_size + Integer_32 (Integer'Min (4, run));
1203 if run >= 4 then
1204 pragma Assert (run <= 259);
1205 rle_1_block_size := rle_1_block_size + 1;
1206 end if;
1207 run := 1;
1208 end Simulate_Store_Run;
1209
1210 start : Boolean := True;
1211
1212 begin
1213 -- Data acquisition:
1214 while More_Bytes
1215 and then rle_1_block_size + 5 < dyn_block_capacity
1216 -- ^ The +5 is because sometimes a pack of max 5 bytes is sent by Store_Run.
1217 and then raw_buf_index < raw_buf'Last
1218 loop
1219 b := Read_Byte;
1220 raw_buf_index := raw_buf_index + 1;
1221 raw_buf (raw_buf_index) := b;
1222 if stream_rest /= unknown_size then
1223 stream_rest := stream_rest - 1;
1224 end if;
1225 if start or else b /= b_prev then
1226 -- Startup or Run break:
1227 Simulate_Store_Run;
1228 start := False;
1229 elsif run = 259 then
1230 -- Force a run break, even though b = b_prev:
1231 Simulate_Store_Run;
1232 else
1233 run := run + 1;
1234 end if;
1235 b_prev := b;
1236 end loop;
1237 Simulate_Store_Run;
1238
1239 if single_segment then
1240 -- No segmentation /splitting:
1241 Encode_Block (raw_buf (1 .. raw_buf_index));
1242 else
1243 Segmentation_for_BZip2.Segment_by_Entropy (raw_buf (1 .. raw_buf_index), seg);
1244
1245 if seg.Is_Empty then
1246 Encode_Block (raw_buf (1 .. 0));
1247 else
1248 for s of seg loop
1249 Encode_Block (raw_buf (index_start .. s));
1250 index_start := s + 1;
1251 end loop;
1252 end if;
1253
1254 if Integer (seg.Length) > 1 then
1255 Trace ("Segmentation into" & seg.Length'Image & " segments", headlines);
1256 for s of seg loop
1257 Trace (" Segment limit at" & s'Image, headlines);
1258 end loop;
1259 end if;
1260
1261 end if;
1262
1263 Unchecked_Free (raw_buf);
1264 exception
1265 when others =>
1266 Unchecked_Free (raw_buf);
1267 raise;
1268 end Read_and_Split_Block;
1269
1270 procedure Write_Stream_Header is
1271 magic : String := stream_header_magic;
1272 begin
1273 magic (magic'Last) := Character'Val (Character'Pos ('0') + level);
1274 Put_Bits (magic);
1275 end Write_Stream_Header;
1276
1277 procedure Write_Stream_Footer is
1278 begin
1279 Put_Bits (stream_footer_magic);
1280 Put_Bits (combined_crc, 32);
1281 if bit_pos < 7 then
1282 Flush_Bit_Buffer;
1283 end if;
1284 end Write_Stream_Footer;
1285
1286 -- Vertically challenged blocks.
1287 small_block_prop_min : constant := 0.05; -- Below that, not worth the trouble.
1288 small_block_prop_max : constant := 0.30; -- Above that, not worth the trouble either.
1289
1290 begin
1291 Write_Stream_Header;
1292 loop
1293 if Float (stream_rest) in
1294 Float (block_capacity) * (1.0 + small_block_prop_min) ..
1295 Float (block_capacity) * (1.0 + small_block_prop_max)
1296 then
1297 -- Avoid encoding the last block as a "too" small one (poorer compression)
1298 -- if we can balance the last two blocks.
1299 -- NB: a more sophisticated balancing using (1.0 - small_block_prop_max)
1300 -- did not deliver convincing results.
1301 Read_and_Split_Block (Natural_32 (stream_rest) / 2);
1302 else
1303 Read_and_Split_Block (block_capacity);
1304 end if;
1305 exit when not More_Bytes;
1306 end loop;
1307 Write_Stream_Footer;
1308 end Encode;
1309
1310 end BZip2.Encoding;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.