Source file : zip-compress.adb
1 -- Legal licensing note:
2
3 -- Copyright (c) 2007 .. 2024 Gautier de Montmollin
4 -- SWITZERLAND
5
6 -- Permission is hereby granted, free of charge, to any person obtaining a copy
7 -- of this software and associated documentation files (the "Software"), to deal
8 -- in the Software without restriction, including without limitation the rights
9 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 -- copies of the Software, and to permit persons to whom the Software is
11 -- furnished to do so, subject to the following conditions:
12
13 -- The above copyright notice and this permission notice shall be included in
14 -- all copies or substantial portions of the Software.
15
16 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 -- THE SOFTWARE.
23
24 -- NB: this is the MIT License, as found on the site
25 -- http://www.opensource.org/licenses/mit-license.php
26
27 with Zip.Create,
28 Zip.Compress.Shrink,
29 Zip.Compress.Reduce,
30 Zip.Compress.Deflate,
31 Zip.Compress.BZip2_E,
32 Zip.Compress.LZMA_E;
33
34 with Ada.Characters.Handling,
35 Ada.Numerics.Discrete_Random,
36 Ada.Strings.Fixed,
37 Ada.Unchecked_Deallocation;
38
39 package body Zip.Compress is
40
41 use Zip_Streams, Zip.CRC_Crypto;
42
43 -- The following procedure's purpose is to detect size overflows
44 -- for Zip data. Even when the input size is known, we can have
45 -- the situation where data is random and the compressed output size
46 -- overflows.
47
48 procedure Increment
49 (out_size : in out Zip_64_Data_Size_Type;
50 by : in Natural)
51 is
52 temp_by : constant ZS_Size_Type := ZS_Size_Type (by);
53 use type Zip_64_Data_Size_Type, ZS_Size_Type;
54 begin
55 if temp_by > Create.max_size then
56 raise Create.Zip_Capacity_Exceeded with
57 "Compressed data too large: size is 2 EiB (Exbibytes) or more.";
58 end if;
59 out_size := out_size + Zip_64_Data_Size_Type (by);
60 end Increment;
61
62 default_byte_IO_buffer_size : constant := 1024 * 1024; -- 1 MiB
63
64 -------------------
65 -- Compress_data --
66 -------------------
67
68 procedure Compress_Data
69 (input,
70 output : in out Zip_Streams.Root_Zipstream_Type'Class;
71 input_size_known : in Boolean;
72 input_size : in Zip_64_Data_Size_Type; -- ignored if input_size_known = False
73 method : in Compression_Method;
74 feedback : in Feedback_Proc;
75 password : in String;
76 content_hint : in Data_Content_Type;
77 CRC : out Interfaces.Unsigned_32;
78 output_size : out Zip_64_Data_Size_Type;
79 zip_type : out Interfaces.Unsigned_16)
80 is
81 use Interfaces;
82 user_aborting : Boolean;
83 idx_in : constant ZS_Index_Type := Index (input);
84 idx_out : constant ZS_Index_Type := Index (output);
85 compression_ok : Boolean;
86 first_feedback : Boolean := True;
87 --
88 is_encrypted : constant Boolean := password /= "";
89 encrypt_pack, mem_encrypt_pack : Crypto_pack;
90 encrypt_header : Byte_Buffer (1 .. 12);
91 package Byte_soup is new Ada.Numerics.Discrete_Random (Byte);
92 use Byte_soup;
93 cg : Byte_soup.Generator;
94 --
95 -- Store data as is, or, if do_write = False, just compute CRC (this is for encryption).
96 --
97 procedure Store_data (do_write : Boolean) is
98 Buffer : Byte_Buffer (1 .. default_byte_IO_buffer_size);
99 Last_Read : Natural;
100 counted : Zip_64_Data_Size_Type := 0;
101 begin
102 zip_type := Compression_format_code.store_code;
103 while not End_Of_Stream (input) loop
104 if input_size_known and then counted >= input_size then
105 exit;
106 end if;
107 -- Copy data
108 Block_Read (input, Buffer, Last_Read);
109 Increment (counted, Last_Read);
110 Update (CRC, Buffer (1 .. Last_Read));
111 if do_write then
112 Encode (encrypt_pack, Buffer (1 .. Last_Read));
113 Block_Write (output, Buffer (1 .. Last_Read));
114 end if;
115 -- Feedback
116 if feedback /= null and then
117 (first_feedback or (counted mod (2**16) = 0) or
118 (input_size_known and then counted = input_size))
119 then
120 if input_size_known then
121 feedback (
122 percents_done =>
123 Natural ((100.0 * Float (counted)) / Float (input_size)),
124 entry_skipped => False,
125 user_abort => user_aborting);
126 else
127 feedback (
128 percents_done => 0,
129 entry_skipped => False,
130 user_abort => user_aborting);
131 end if;
132 first_feedback := False;
133 if user_aborting then
134 raise User_abort;
135 end if;
136 end if;
137 end loop;
138 output_size := counted;
139 compression_ok := True;
140 end Store_data;
141 --
142 procedure Compress_data_single_method (actual_method : Single_Method) is
143 begin
144 Init (CRC);
145 if is_encrypted then
146 Init_Keys (encrypt_pack, password);
147 Set_Mode (encrypt_pack, encrypted);
148 -- A bit dumb from Zip spec: we need to know the final CRC in order to set up
149 -- the last byte of the encryption header, that allows for detecting if a password
150 -- is OK - this, with 255/256 probability of correct detection of a wrong password!
151 -- Result: 1st scan of the whole input stream with CRC calculation:
152 Store_data (do_write => False);
153 Reset (cg);
154 for i in 1 .. 11 loop
155 encrypt_header (i) := Random (cg);
156 end loop;
157 encrypt_header (12) := Byte (Shift_Right (Final (CRC), 24));
158 Set_Index (input, idx_in);
159 Init (CRC);
160 Encode (encrypt_pack, encrypt_header);
161 Block_Write (output, encrypt_header);
162 --
163 -- We need to remember at this point the encryption keys in case we need
164 -- to rewrite from here (compression failed, store data).
165 --
166 mem_encrypt_pack := encrypt_pack;
167 else
168 Set_Mode (encrypt_pack, clear);
169 end if;
170 --
171 -- Dispatch the work to child procedures doing the stream compression
172 -- in different formats, depending on the actual compression method.
173 -- For example, for methods LZMA_for_JPEG, LZMA_for_WAV, or LZMA_3, we
174 -- logically call Zip.Compress.LZMA_E for the job.
175 --
176 case actual_method is
177
178 when Store =>
179 Store_data (do_write => True);
180
181 when Shrink =>
182 Zip.Compress.Shrink
183 (input, output, input_size_known, input_size, feedback,
184 CRC, encrypt_pack, output_size, compression_ok);
185 zip_type := Compression_format_code.shrink_code;
186
187 when Reduction_Method =>
188 Zip.Compress.Reduce
189 (input, output, input_size_known, input_size, feedback,
190 actual_method,
191 CRC, encrypt_pack, output_size, compression_ok);
192 zip_type := Compression_format_code.reduce_code +
193 Unsigned_16
194 (Compression_Method'Pos (actual_method)
195 Compression_Method'Pos (Reduce_1));
196
197 when Deflation_Method =>
198 Zip.Compress.Deflate
199 (input, output, input_size_known, input_size, feedback,
200 actual_method,
201 CRC, encrypt_pack, output_size, compression_ok);
202 zip_type := Compression_format_code.deflate_code;
203
204 when BZip2_Method =>
205 Zip.Compress.BZip2_E
206 (input, output, input_size_known, input_size, feedback,
207 actual_method,
208 CRC, encrypt_pack, output_size, compression_ok);
209 zip_type := Compression_format_code.bzip2_code;
210
211 when LZMA_Method =>
212 Zip.Compress.LZMA_E
213 (input, output, input_size_known, input_size, feedback,
214 actual_method,
215 CRC, encrypt_pack, output_size, compression_ok);
216 zip_type := Compression_format_code.lzma_code;
217 end case;
218 CRC := Final (CRC);
219 --
220 -- Handle case where compression has been unefficient:
221 -- data to be compressed is too "random"; then compressed data
222 -- happen to be larger than uncompressed data
223 --
224 if not compression_ok then
225 -- Go back to the beginning and just store the data
226 Set_Index (input, idx_in);
227 if is_encrypted then
228 Set_Index (output, idx_out + 12);
229 -- Restore the encryption keys to their state just after the encryption header:
230 encrypt_pack := mem_encrypt_pack;
231 else
232 Set_Index (output, idx_out);
233 end if;
234 Init (CRC);
235 Store_data (do_write => True);
236 CRC := Final (CRC);
237 end if;
238 if is_encrypted then
239 output_size := output_size + 12;
240 end if;
241 end Compress_data_single_method;
242
243 fast_presel_threshold : constant := 10_000;
244 bzip2_threshold : constant := 15_000;
245
246 fast_presel : constant Boolean :=
247 method = Preselection_1 or (input_size_known and then input_size < fast_presel_threshold);
248
249 data_type_to_LZMA_method : constant array (Data_Content_Type) of LZMA_Method :=
250 (JPEG => LZMA_for_JPEG,
251 ARW_RW2 => LZMA_for_ARW,
252 ORF_CR2 => LZMA_for_ORF,
253 MP3 => LZMA_for_MP3,
254 MP4 => LZMA_for_MP4,
255 PGM => LZMA_for_PGM,
256 PPM => LZMA_for_PPM,
257 PNG => LZMA_for_PNG,
258 WAV => LZMA_for_WAV,
259 AU => LZMA_for_AU,
260 others => LZMA_1); -- Fake, should be unused as such.
261
262 begin
263 case method is
264 --
265 when Single_Method =>
266 Compress_data_single_method (method);
267 --
268 when Preselection_Method =>
269 case content_hint is
270 when neutral | text_data =>
271 if input_size_known and then input_size < 9_000 then
272 Compress_data_single_method (Deflate_3); -- Deflate
273 elsif fast_presel then
274 -- See: Optimum, LZ77 sheet in za_work.xls
275 -- or l2_vs_l3.xls with a larger data set.
276 Compress_data_single_method (LZMA_2); -- LZMA with IZ_10 match finder
277 else
278 Compress_data_single_method (LZMA_3); -- LZMA with BT4 match finder
279 end if;
280
281 when ARW_RW2 | ORF_CR2 | MP3 | MP4 | JPEG | PGM | PPM | PNG | WAV | AU =>
282 if input_size_known and then input_size < 2_250 then
283 Compress_data_single_method (Deflate_3); -- Deflate
284 else
285 Compress_data_single_method (data_type_to_LZMA_method (content_hint));
286 end if;
287
288 when GIF =>
289 if input_size_known and then input_size < 350 then
290 Compress_data_single_method (Deflate_1);
291 else
292 Compress_data_single_method (LZMA_for_GIF);
293 end if;
294
295 when Zip_in_Zip =>
296 if input_size_known and then input_size < 1_000 then
297 Compress_data_single_method (Deflate_3); -- Deflate
298 elsif fast_presel then
299 Compress_data_single_method (LZMA_2_for_Zip_in_Zip);
300 else
301 Compress_data_single_method (LZMA_3_for_Zip_in_Zip);
302 end if;
303
304 when source_code =>
305 if input_size_known and then input_size < 8_000 then
306 Compress_data_single_method (Deflate_3); -- Deflate
307 elsif fast_presel then
308 Compress_data_single_method (LZMA_2_for_Source);
309 elsif input_size_known and then input_size < bzip2_threshold then
310 Compress_data_single_method (LZMA_3_for_Source);
311 else
312 Compress_data_single_method (BZip2_Method'Last);
313 end if;
314
315 when text_formatted_text_or_dna =>
316 if input_size_known and then input_size < 9_000 then
317 Compress_data_single_method (Deflate_3);
318 elsif fast_presel then
319 Compress_data_single_method (LZMA_2);
320 elsif input_size_known and then input_size < bzip2_threshold then
321 Compress_data_single_method (LZMA_3);
322 else
323 Compress_data_single_method (BZip2_Method'Last);
324 end if;
325
326 end case;
327 end case;
328 end Compress_Data;
329
330 function Guess_Type_from_Name (name : String) return Data_Content_Type is
331 use Ada.Characters.Handling, Ada.Strings, Ada.Strings.Fixed;
332 up : constant String := To_Upper (name);
333 dot : constant Natural := Index (up, ".", Backward);
334 begin
335 if dot = 0 then
336 return neutral;
337 end if;
338 declare
339 ext : constant String := up (dot + 1 .. up'Last);
340 begin
341 if ext in "JPG" | "JPEG" then
342 return JPEG;
343 end if;
344 if ext in
345 "A" | "ADA" | "ADS" | "ADB" | -- Ada
346 "PRC" | "PKG" | "HAC" | "GPR" |
347 "F" | "FOR" | -- Fortran
348 "C" | "H" | "CPP" | "HPP" | -- C/C++
349 "DEF" | "ASM" | -- Assembler
350 "JAVA" | "CS" |
351 "PAS" | "INC" | "LPR" | "PP" | -- Pascal
352 "M" | -- Matlab
353 "M4" | "MAK" | "IN" | -- Macro assembler
354 "SH" | "BAT" | "CMD" | -- Operating System Script
355 "PO" | -- GNU PO
356 "XML" | "XSL" |
357 "SGML" |
358 "AUP" | -- Audacity project (XML)
359 "HTM" | "HTML" |
360 "JS" | "LSP" | "SCM" |
361 "SQL" | "PDB" | "PL"
362 then
363 return source_code;
364 end if;
365 if ext in "CFG" | "INI" | "LOG" | "CSV" | "SVG" | "JSON" then
366 return text_data;
367 end if;
368 if ext in "TXT" | "RTF" | "HTM" | "HTML" | "GB" | "FASTA" then
369 return text_formatted_text_or_dna;
370 end if;
371 -- Zip archives happen to be zipped...
372 if ext in
373 "EPUB" | -- EPUB: e-book reader format
374 "ZIP" |
375 "JAR" |
376 "ODB" | "ODS" | "ODT" | "OTR" | "OTS" | "OTT" |
377 "CRX" | "NTH" |
378 "DOCX" | "PPTX" | "XLSX" | "XLSB" | "XLSM"
379 then
380 return Zip_in_Zip;
381 end if;
382 -- Some raw camera picture data
383 if ext in "ORF" | -- Raw Olympus
384 "CR2" | -- Raw Canon
385 "RAF" | -- Raw Fujifilm
386 "SRW" -- Raw Samsung
387 then
388 return ORF_CR2;
389 end if;
390 if ext in "ARW" | -- Raw Sony
391 "RW2" | -- Raw Panasonic
392 "NEF" | -- Raw Nikon
393 "DNG" | -- Raw Leica, Pentax
394 "X3F" -- Raw Sigma
395 then
396 return ARW_RW2;
397 end if;
398 if ext = "PGM" then
399 return PGM;
400 end if;
401 if ext = "PPM" then
402 return PPM;
403 end if;
404 if ext = "MP3" then
405 return MP3;
406 end if;
407 if ext in "MTS" | "MP4" | "M4A" | "M4P" then
408 return MP4;
409 end if;
410 if ext = "PNG" then
411 return PNG;
412 end if;
413 if ext = "GIF" then
414 return GIF;
415 end if;
416 if ext in "WAV" | "UAX" then
417 return WAV;
418 end if;
419 if ext = "AU" then -- Audacity raw data
420 return AU;
421 end if;
422 end;
423 return neutral;
424 end Guess_Type_from_Name;
425
426 -----------------------------------
427 -- I/O buffers for compression --
428 -----------------------------------
429
430 procedure Allocate_Buffers
431 (b : in out IO_Buffers_Type;
432 input_size_known : Boolean;
433 input_size : Zip_64_Data_Size_Type)
434 is
435 calibration : Zip_64_Data_Size_Type := default_byte_IO_buffer_size;
436 begin
437 if input_size_known then
438 calibration :=
439 Zip_64_Data_Size_Type'Min
440 (default_byte_IO_buffer_size,
441 Zip_64_Data_Size_Type'Max (8, input_size));
442 end if;
443 b.InBuf := new Byte_Buffer (1 .. Integer (calibration));
444 b.OutBuf := new Byte_Buffer (1 .. default_byte_IO_buffer_size);
445 end Allocate_Buffers;
446
447 procedure Deallocate_Buffers (b : in out IO_Buffers_Type) is
448 procedure Dispose_Buffer is
449 new Ada.Unchecked_Deallocation (Byte_Buffer, p_Byte_Buffer);
450 begin
451 Dispose_Buffer (b.InBuf);
452 Dispose_Buffer (b.OutBuf);
453 end Deallocate_Buffers;
454
455 procedure Read_Block
456 (b : in out IO_Buffers_Type;
457 input : in out Zip_Streams.Root_Zipstream_Type'Class)
458 is
459 begin
460 Zip.Block_Read
461 (stream => input,
462 buffer => b.InBuf.all,
463 actually_read => b.MaxInBufIdx);
464 b.InputEoF := b.MaxInBufIdx = 0;
465 b.InBufIdx := 1;
466 end Read_Block;
467
468 procedure Write_Block
469 (b : in out IO_Buffers_Type;
470 input_size_known : Boolean;
471 input_size : Zip_64_Data_Size_Type;
472 output : in out Zip_Streams.Root_Zipstream_Type'Class;
473 output_size : in out Zip_64_Data_Size_Type;
474 crypto : in out Zip.CRC_Crypto.Crypto_pack)
475 is
476 amount : constant Integer := b.OutBufIdx - 1;
477 use type Zip_64_Data_Size_Type;
478 begin
479 Increment (output_size, Integer'Max (0, amount));
480 if input_size_known and then output_size >= input_size then
481 -- The compression so far is obviously inefficient for that file.
482 -- Useless to go further.
483 -- Stop immediately before growing the file more than the
484 -- uncompressed size.
485 raise Compression_inefficient;
486 end if;
487 Encode (crypto, b.OutBuf (1 .. amount));
488 Zip.Block_Write (output, b.OutBuf (1 .. amount));
489 b.OutBufIdx := 1;
490 end Write_Block;
491
492 end Zip.Compress;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.