Source file : lzma.ads
1 -- LZMA library
2 ----------------
3 -- Library for encoding and decoding data streams in the LZMA compression
4 -- format invented by Igor Pavlov.
5 --
6 -- Pure Ada 95+ code, 100% portable: OS-, CPU- and compiler- independent.
7
8 -- Legal licensing note:
9
10 -- Copyright (c) 2016 .. 2019 Gautier de Montmollin
11 -- SWITZERLAND
12
13 -- Permission is hereby granted, free of charge, to any person obtaining a copy
14 -- of this software and associated documentation files (the "Software"), to deal
15 -- in the Software without restriction, including without limitation the rights
16 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 -- copies of the Software, and to permit persons to whom the Software is
18 -- furnished to do so, subject to the following conditions:
19
20 -- The above copyright notice and this permission notice shall be included in
21 -- all copies or substantial portions of the Software.
22
23 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 -- THE SOFTWARE.
30
31 -- NB: this is the MIT License, as found 21-Aug-2016 on the site
32 -- http://www.opensource.org/licenses/mit-license.php
33
34 with Ada.Direct_IO; -- Only used for the type Data_Bytes_Count below.
35 with Interfaces;
36 with System;
37
38 package LZMA is
39
40 -- The compression and decompression procedures are located
41 -- in child packages LZMA.Encoding and LZMA.Decoding respectively.
42
43 -- Bits of last byte being used as context.
44 -- With the value 8, LZMA uses a complete Markov chain for predicting
45 -- a literal from the previous one, like PKZip's Reduce format.
46 subtype Literal_Context_Bits_Range is Integer range 0 .. 8;
47
48 -- Position mod 2**bits is used, but for literal context only.
49 subtype Literal_Position_Bits_Range is Integer range 0 .. 4;
50
51 -- Position mod 2**bits is used in various places.
52 subtype Position_Bits_Range is Integer range 0 .. 4;
53
54 Default_dictionary_size : constant := 2 ** 15; -- 32 KiB, like Deflate.
55
56 subtype Byte is Interfaces.Unsigned_8;
57
58 -- Ada.Direct_IO is only there for the Data_Bytes_Count type.
59 -- In case you want to avoid reference to Ada.Direct_IO,
60 -- you can customize the definition of Data_Bytes_Count, provided
61 -- it has enough capacity for counting bytes in the streams involved.
62 package BIO is new Ada.Direct_IO (Byte);
63 subtype Data_Bytes_Count is BIO.Count;
64
65 private
66
67 use Interfaces;
68
69 -- These integer types are defined in the LZMA specification
70 -- (DRAFT version, 2015-06-14, by Igor Pavlov)
71
72 type Unsigned is mod 2 ** System.Word_Size;
73 subtype UInt64 is Unsigned_64;
74 subtype UInt32 is Unsigned_32;
75 subtype UInt16 is Unsigned_16;
76
77 ----------------------------
78 -- Finite state machine --
79 ----------------------------
80
81 states_count : constant := 12; -- LZMA specification name: "kNumStates"
82 subtype State_range is Unsigned range 0 .. states_count - 1;
83 type Transition is array (State_range) of State_range;
84
85 ------------------------------------ From ... 0 1 2 3 4 5 6 7 8 9 10 11
86 Update_State_Literal : constant Transition := (0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5);
87 Update_State_Match : constant Transition := (7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10);
88 Update_State_Rep : constant Transition := (8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11);
89 Update_State_ShortRep : constant Transition := (9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11);
90
91 -- Context for improving compression of aligned data,
92 -- modulo 2**n = 2, 4, 8 or 16 (max) bytes, or disabled: n = 0.
93 max_pos_bits : constant := 4; -- LZMA specification name: "kNumPosBitsMax"
94 max_pos_states_count : constant := 2**max_pos_bits;
95 subtype Pos_state_range is Unsigned range 0 .. max_pos_states_count - 1;
96
97 ----------------------------------------
98 -- Probability model for bit coding --
99 ----------------------------------------
100
101 probability_model_bits : constant := 11; -- LZMA specification name: "kNumBitModelTotalBits"
102 probability_model_count : constant := 2 ** probability_model_bits;
103
104 probability_change_bits : constant := 5; -- LZMA specification name: "kNumMoveBits"
105
106 -- All probabilities are initialized with p=0.5. LZMA specification name: "PROB_INIT_VAL"
107 initial_probability : constant := probability_model_count / 2;
108
109 -- Type for storing probabilities, must have at least Probability_model_bits bits.
110 -- LZMA specification recommends UInt16. LzmaEnc.c uses UInt16 or optionally UInt32.
111 type CProb is new UInt16;
112
113 -- Integer (signed) used as index because there is a -1 (unused) index in Pos_coder_range.
114 type CProb_array is array (Integer range <>) of CProb;
115
116 align_bits : constant := 4; -- LZMA specification name: "kNumAlignBits"
117 align_table_size : constant := 2 ** align_bits;
118 align_mask : constant := align_table_size - 1;
119
120 subtype Bits_3_range is Integer range 0 .. 2**3 - 1;
121 subtype Bits_6_range is Integer range 0 .. 2**6 - 1;
122 subtype Bits_8_range is Integer range 0 .. 2**8 - 1;
123 subtype Bits_NAB_range is Integer range 0 .. 2**align_bits - 1;
124
125 subtype Probs_3_bits is CProb_array (Bits_3_range);
126 subtype Probs_6_bits is CProb_array (Bits_6_range);
127 subtype Probs_8_bits is CProb_array (Bits_8_range);
128 subtype Probs_NAB_bits is CProb_array (Bits_NAB_range);
129
130 --------------------------------------------------
131 -- Probabilities for the binary decision tree --
132 --------------------------------------------------
133
134 type Probs_State is array (State_range) of CProb;
135 type Probs_State_and_Pos_State is array (State_range, Pos_state_range) of CProb;
136
137 type Probs_for_Switches is record
138 -- This is the context for the switch between a Literal and a LZ Distance-Length code
139 match : Probs_State_and_Pos_State := (others => (others => initial_probability));
140 -- These are contexts for various repetition modes
141 rep : Probs_State := (others => initial_probability);
142 rep_g0 : Probs_State := (others => initial_probability);
143 rep_g1 : Probs_State := (others => initial_probability);
144 rep_g2 : Probs_State := (others => initial_probability);
145 rep0_long : Probs_State_and_Pos_State := (others => (others => initial_probability));
146 end record;
147
148 ------------------------------------
149 -- Probabilities for LZ lengths --
150 ------------------------------------
151
152 type Low_Mid_Coder_Probs is array (Pos_state_range) of Probs_3_bits;
153
154 -- Probabilities used for encoding LZ lengths. LZMA specification name: "CLenDecoder"
155 type Probs_for_LZ_Lengths is record
156 choice_1 : CProb := initial_probability; -- 0: low coder; 1: mid or high
157 choice_2 : CProb := initial_probability; -- 0: mid; 1: high
158 low_coder : Low_Mid_Coder_Probs := (others => (others => initial_probability));
159 mid_coder : Low_Mid_Coder_Probs := (others => (others => initial_probability));
160 high_coder : Probs_8_bits := (others => initial_probability);
161 end record;
162
163 --------------------------------------
164 -- Probabilities for LZ distances --
165 --------------------------------------
166
167 len_to_pos_states : constant := 4;
168 subtype Slot_Coder_Range is Unsigned range 0 .. len_to_pos_states - 1;
169 type Slot_Coder_Probs is array (Slot_Coder_Range) of Probs_6_bits;
170 Dist_slot_bits : constant := 6; -- "kNumPosSlotBits"
171
172 Start_dist_model_index : constant := 4; -- "kStartPosModelIndex"
173 End_dist_model_index : constant := 14; -- LZMA specification name: "kEndPosModelIndex"
174 Num_full_distances : constant := 2 ** (End_dist_model_index / 2); -- "kNumFullDistances"
175
176 -- Pos_coder_range: index -1 is never used as such but appears
177 -- when calling Bit_Tree_Reverse_Encode (as in the original C version, RcTree_ReverseEncode).
178 subtype Pos_coder_range is Integer range -1 .. Num_full_distances - End_dist_model_index;
179 subtype Pos_coder_probs is CProb_array (Pos_coder_range);
180
181 type Probs_for_LZ_Distances is record
182 slot_coder : Slot_Coder_Probs := (others => (others => initial_probability));
183 align_coder : Probs_NAB_bits := (others => initial_probability);
184 pos_coder : Pos_coder_probs := (others => initial_probability);
185 end record;
186
187 --------------------------------------
188 -- All probabilities used by LZMA --
189 --------------------------------------
190
191 type All_probabilities (last_lit_prob_index : Integer) is record
192 -- Literals:
193 lit : CProb_array (0 .. last_lit_prob_index) := (others => initial_probability);
194 -- Distances:
195 dist : Probs_for_LZ_Distances;
196 -- Lengths:
197 len : Probs_for_LZ_Lengths;
198 rep_len : Probs_for_LZ_Lengths;
199 -- Decision tree switches:
200 switch : Probs_for_Switches;
201 end record;
202
203 -------------
204 -- Misc. --
205 -------------
206
207 -- Minimum dictionary (= plain text buffer of n previous bytes)
208 -- size is 4096. LZMA specification name: "LZMA_DIC_MIN"
209 Min_dictionary_size : constant := 2 ** 12;
210
211 -- Log2-style encoding of LZ lengths
212 Len_low_bits : constant := 3;
213 Len_low_symbols : constant := 2 ** Len_low_bits;
214 Len_mid_bits : constant := 3;
215 Len_mid_symbols : constant := 2 ** Len_mid_bits;
216 Len_high_bits : constant := 8;
217 Len_high_symbols : constant := 2 ** Len_high_bits;
218 Len_symbols : constant := Len_low_symbols + Len_mid_symbols + Len_high_symbols;
219
220 Min_match_length : constant := 2; -- "LZMA_MATCH_LEN_MIN"
221 Max_match_length : constant := Min_match_length + Len_symbols - 1; -- "LZMA_MATCH_LEN_MAX"
222
223 subtype Match_length_range is Integer range Min_match_length .. Max_match_length;
224
225 -- Fake distance, used as an end-of-stream marker.
226 end_of_stream_magic_distance : constant := 16#FFFF_FFFF#;
227
228 --------------------------------------------------
229 -- Binary values of various decision switches --
230 --------------------------------------------------
231
232 -- LZ literal vs. DL code
233 Literal_choice : constant := 0;
234 DL_code_choice : constant := 1;
235
236 -- Within DL code: "Simple match" vs. "Rep match"
237 Simple_match_choice : constant := 0;
238 Rep_match_choice : constant := 1;
239
240 -- Within "Rep match": "Distance is rep0" vs. "Distance is not rep0"
241 The_distance_is_rep0_choice : constant := 0;
242 The_distance_is_not_rep0_choice : constant := 1;
243 -- Within "Distance is rep0":
244 The_length_is_1_choice : constant := 0;
245 The_length_is_not_1_choice : constant := 1;
246 -- Within "Distance is not rep0": "Distance is rep1" vs. "Distance is not rep1"
247 The_distance_is_rep1_choice : constant := 0;
248 The_distance_is_not_rep1_choice : constant := 1;
249 -- Within "Distance is not rep1": "Distance is rep2" vs. "Distance is not rep2"
250 The_distance_is_rep2_choice : constant := 0;
251 The_distance_is_not_rep2_choice : constant := 1;
252
253 ----------------------
254 -- Range encoding --
255 ----------------------
256
257 -- Normalization threshold. When the range width is below that value,
258 -- a shift is needed.
259 width_threshold : constant := 2**24; -- LZMA specification name: "kTopValue"
260
261 -- The following article (the only reference in the LZMA specification)
262 -- explains how range encoding works:
263 --
264 -- G. N. N. Martin, Range encoding: an algorithm for removing redundancy
265 -- from a digitized message, Video & Data Recording Conference,
266 -- Southampton, UK, July 24-27, 1979.
267
268 end LZMA;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.