Source file : zip-compress-reduce.adb
--
-- "Reduce" method - probabilistic reduction with a Markov chain.
-- See package specification for details.
--
-- Change log:
--
-- 7-Feb-2009: GdM: added a cache for LZ77 output to make 2nd phase faster
with Interfaces; use Interfaces;
with LZ77, Zip.CRC_Crypto;
with Zip_Streams;
with Ada.Text_IO; use Ada.Text_IO;
with Ada.Unchecked_Deallocation;
procedure Zip.Compress.Reduce
(input,
output : in out Zip_Streams.Root_Zipstream_Type'Class;
input_size_known: Boolean;
input_size : File_size_type;
feedback : Feedback_proc;
method : Reduction_Method;
CRC : in out Interfaces.Unsigned_32; -- only updated here
crypto : in out Crypto_pack;
output_size : out File_size_type;
compression_ok : out Boolean -- indicates compressed < uncompressed
)
is
reduction_factor: constant Positive:=
1 + Compression_Method'Pos(method) - Compression_Method'Pos(Reduce_1);
use Zip_Streams;
DLE_code: constant:= 144; -- "Escape" character, leading to a Distance - Length code.
subtype Symbol_range is Integer range 0..255;
subtype Follower_range is Symbol_range range 0..31;
-- PKWARE appnote.txt limits to 32 followers.
-- Above 32, you get "PKUNZIP: (W03) Warning! file has bad table"
-- Up to 63 is indeed possible and accepted by unzip <=5.12, WinZip
-- and of course Zip-Ada :-)
-- Optimum with 63 is extremely rare, the gain on test
-- files showing a 63 is 0.02%.
-- Then, we prefer compatibility here.
Followers: array (Symbol_range, Follower_range) of Symbol_range:=
(others=> (others=> 0));
Slen: array (Symbol_range) of Symbol_range;
-- Bits taken by (x-1) mod 256:
B_Table: constant array(Symbol_range) of Integer:=
( 0 => 8,
1..2 => 1,
3..4 => 2,
5..8 => 3,
9..16 => 4,
17..32 => 5,
33..64 => 6,
65..128 => 7,
129..255 => 8 );
------------------
-- Buffered I/O --
------------------
-- Define data types needed to implement input and output file buffers
procedure Dispose is
new Ada.Unchecked_Deallocation(Byte_Buffer, p_Byte_Buffer);
InBuf: p_Byte_Buffer; -- I/O buffers
OutBuf: p_Byte_Buffer;
InBufIdx: Positive; -- Points to next char in buffer to be read
OutBufIdx: Positive; -- Points to next free space in output buffer
MaxInBufIdx: Natural; -- Count of valid chars in input buffer
InputEoF: Boolean; -- End of file indicator
procedure Read_Block is
begin
Zip.BlockRead(
stream => input,
buffer => InBuf.all,
actually_read => MaxInBufIdx
);
InputEoF:= MaxInBufIdx = 0;
InBufIdx := 1;
end Read_Block;
-- Exception for the case where compression works but produces
-- a bigger file than the file to be compressed (data is too "random").
Compression_inefficient: exception;
procedure Write_Block is
amount: constant Integer:= OutBufIdx-1;
begin
output_size:= output_size + File_size_type(Integer'Max(0,amount));
if input_size_known and then output_size >= input_size then
-- The compression so far is obviously unefficient for that file.
-- Useless to go further.
-- Stop immediately before growing the file more than the
-- uncompressed size.
raise Compression_inefficient;
end if;
Encode(crypto, OutBuf(1 .. amount));
Zip.BlockWrite(output, OutBuf(1 .. amount));
OutBufIdx := 1;
end Write_Block;
procedure Put_byte(B : Byte) is
begin
OutBuf(OutBufIdx) := B;
OutBufIdx:= OutBufIdx + 1;
if OutBufIdx > OutBuf'Last then
Write_Block;
end if;
end Put_byte;
--------------------------------------------------------------------------
-----------------
-- Code buffer --
-----------------
Save_byte: Byte; -- Output code buffer
Bits_used: Byte; -- Index into output code buffer
procedure Flush_output is
begin
if Bits_used /= 0 then
Put_byte(Save_byte);
end if;
if OutBufIdx > 1 then
Write_Block;
end if;
end Flush_output;
procedure Put_code(Code : Byte; Code_size: Natural) is
Code_work: Byte;
temp, Save_byte_local, Bits_used_local: Byte;
begin
Code_work:= Code;
temp:= 0;
Save_byte_local:= Save_byte;
Bits_used_local:= Bits_used;
for count in reverse 1 .. Code_size loop
temp:= 0;
if Code_work mod 2 = 1 then
temp:= temp + 1;
end if;
Code_work:= Code_work / 2;
temp:= Shift_Left(temp, Integer(Bits_used_local));
Bits_used_local:= Bits_used_local+1;
Save_byte_local:= Save_byte_local or temp;
if Bits_used_local = 8 then
Put_byte(Save_byte_local);
Save_byte_local:= 0;
temp:= 0;
Bits_used_local:= 0;
end if;
end loop;
Save_byte:= Save_byte_local;
Bits_used:= Bits_used_local;
end Put_code;
procedure Show_symbol(S: Symbol_range) is
begin
if S in 32..126 then
Ada.Text_IO.Put(Character'Val(S));
else
Ada.Text_IO.Put('{' & Symbol_range'Image(S) & '}');
end if;
end Show_symbol;
procedure Save_Followers is
begin
for X in reverse Symbol_range loop
Put_code(Byte(Slen(X)),6); -- max 2**6 followers per symbol
for I in 0 .. Slen(X)-1 loop
Put_code(Byte(Followers(X,I)),8);
end loop;
end loop;
end Save_Followers;
----------------------------------------
-- Probabilistic back-end compression --
----------------------------------------
subtype Count is Integer_32;
markov_d: array(Symbol_range, Symbol_range) of Count:=
(others=> (others=> 0));
-- Build probability of transition from symbol i to symbol j:
--
-- markov(i,j) = P(symbol i is followed by j)
-- = markov_d(i,j) / sum_k( markov_d(i,k))
--
-- Only the discrete matrix markov_d is stored.
total_row: array(Symbol_range) of Count;
-- total_row(i) = sum_k( markov_d(i,k))
order: array(Symbol_range, Symbol_range) of Symbol_range;
use_probas: constant Boolean:= True;
trace : constant Boolean:= False;
-- We use the most significant quantiles of each row of the Markov matrix
-- to allow for a frequent coding that is shorter than the symbol itself.
-- Otherwise, why doing all that ;-) ?...
has_follower: array(Symbol_range, Symbol_range) of Boolean:=
(others => (others => False));
follower_pos: array(Symbol_range, Symbol_range) of Follower_range;
-- follower_pos(a,b) only significant if has_follower(a,b) = True
procedure Show_partial_markov(ordered: Boolean) is
subtype subrange is
Symbol_range range Character'Pos('a')..Character'Pos('i');
sk, min, max: Symbol_range;
begin
if ordered then
min:= Follower_range'First;
max:= Follower_range'Last;
else
min:= subrange'First;
max:= subrange'Last;
end if;
New_Line;
for si in subrange loop
Show_symbol(si);
Put("| ");
for sj in min..max loop
if ordered then -- show top probas
sk:= order(si,sj);
else -- show probas in the same subrange
sk:= sj;
end if;
Show_symbol(sk);
Put(':' & Count'Image(markov_d(si,sj)) & ' ');
end loop;
if ordered then
Put("|" & Count'Image(total_row(si)) & Integer'Image(Slen(si)));
end if;
New_Line;
end loop;
end Show_partial_markov;
procedure Build_Followers is
procedure Swap(a,b: in out Count) is
pragma Inline(Swap);
c: Count;
begin c:= a;a:= b;b:= c; end Swap;
procedure Swap(a,b: in out Symbol_range) is
pragma Inline(Swap);
c: Symbol_range;
begin c:= a;a:= b;b:= c; end Swap;
-- Sort a row (sym_row) in the Markov matrix, largest probabilities first.
-- Original order is kept by the order matrix.
--
procedure Sort(sym_row: Symbol_range) is
-- A stupid bubble sort algo, but one working with sets
-- having big packs of same data (trouble with qsort)...
swapped: Boolean;
left: Symbol_range:= Symbol_range'First;
begin
loop
swapped:= False;
for i in reverse left .. Symbol_range'Last-1 loop
if markov_d(sym_row, i) < markov_d(sym_row, i+1) then
Swap( markov_d(sym_row, i), markov_d(sym_row, i+1));
Swap( order (sym_row, i), order (sym_row, i+1));
swapped:= True;
end if;
end loop;
exit when not swapped;
left:= left+1;
end loop;
end Sort;
cumul: Count;
follo: Symbol_range;
subtype Bit_range is Integer range 0..5;
-- 6 would be possible, PKWARE appnote.txt limits numb.
-- of followers to 2**5
max_follo: constant array(Bit_range) of Integer:=
-- mostly 2**n - 1
-- actual follower range is: 0..max_follo(bits)
( 0 => -1, -- NB: not 0; range is empty here
1 => 1,
2 => 3,
3 => 7,
4 => 15,
5 => 31
-- 6 => 62 -- NB: not 63
);
cumul_per_bit_length: array(Bit_range) of Count:= (others => 0);
subtype Real is Long_Float;
exp_size, min_size: Real;
bits_min: Bit_range;
begin -- Build_Followers
if not use_probas then
Slen:= (others => 0);
return;
end if;
if trace then
Show_partial_markov(False);
end if;
for si in Symbol_range loop
total_row(si):= 0;
for sj in Symbol_range loop
order(si,sj):= sj;
total_row(si):= total_row(si) + markov_d(si,sj);
end loop;
Sort(si);
cumul:= 0;
-- Define all possible followers to symbol si:
for sj in Follower_range loop
cumul:= cumul + markov_d(si,sj);
cumul_per_bit_length(B_Table(sj+1)):= cumul;
-- ^ Overwritten several times. When sj jumps to the next bit length
-- (say, bl+1), cumul_per_bit_length(bl) contains the amount of symbols
-- for the stream to be encoded that can be emitted as immediate
-- successors to symbol si by using follower shortcuts of bit length bl.
follo:= order(si,sj);
Followers(si,sj):= follo;
follower_pos(si, follo):= sj;
end loop;
-- Now we decide to which length we are using the followers
min_size:= Real(total_row(si)) * 8.0;
-- ^ Size of all codes, in bits, when no follower
-- at all is defined for symbol si. In the next
-- loop, we will work to reduce that size.
bits_min:= 0;
for bits in 1 .. Bit_range'Last loop
-- We compute the exact size of reduced output (entire file!) for
-- symbol si when cutting the follower range to 0..2**bits-1.
exp_size:=
Real(cumul_per_bit_length(bits)) * Real(bits+1) +
-- ^ Coded followers
Real(total_row(si)-cumul_per_bit_length(bits)) * 9.0 +
-- ^ All codes outside the follower list will take 8+1 bits
Real(max_follo(bits)+1) * 8.0;
-- ^ Also the follower list at the beginning takes place...
if exp_size < min_size then
-- So far, it is more efficient to encode si's followers with 'bits' bits.
min_size:= exp_size;
bits_min:= bits;
end if;
end loop;
Slen(si):= max_follo(bits_min)+1;
for sj in 0 .. max_follo(bits_min) loop
has_follower(si, Followers(si,sj)):= True;
end loop;
end loop;
if trace then
Show_partial_markov(True);
end if;
end Build_Followers;
--------------------------------
-- LZ77 front-end compression --
--------------------------------
-- Cache for LZ-compressed data, to speedup the 2nd phase:
LZ_cache_size: constant:= 2**18; -- 256KB
type LZ_buffer_range is mod LZ_cache_size;
type LZ_buffer is array(LZ_buffer_range) of Byte; -- circular buffer
type LZ_cache_type is record
buf: LZ_buffer; -- buf's index arithmetic is mod LZ_cache_size
nxt: LZ_buffer_range:= 0; -- position of next byte to be written
cnt: Natural:= 0; -- [0..size]: count of cached bytes
end record;
LZ_cache: LZ_cache_type;
lz77_pos, lz77_size: File_size_type:= 0;
-- Possible ranges for LZ distance and length encoding
-- in the Zip-Reduce format:
subtype Length_range is
Integer range 4 .. 2**(8-reduction_factor)+257;
subtype Distance_range is
Integer range 1 .. (2**reduction_factor)*256;
-- max length max dist
-- 1 385 512
-- 2 321 1024
-- 3 289 2048
-- 4 273 4096
type Phase_type is (stats, compress);
generic
phase: Phase_type;
procedure Encode;
procedure Encode is
using_LZ77: Boolean;
Derail_LZ77: exception;
X_Percent: Natural;
Bytes_in : Natural; -- Count of input file bytes processed
user_aborting: Boolean;
real_pct: constant array(Phase_type) of Integer:= (0, 50);
PctDone: Natural;
function Read_byte return Byte is
b: Byte;
begin
b:= InBuf(InBufIdx);
InBufIdx:= InBufIdx + 1;
if phase = stats then
Zip.CRC_Crypto.Update(CRC, (1=> b));
end if;
Bytes_in:= Bytes_in + 1;
if feedback /= null then
if Bytes_in = 1 then
feedback(real_pct(phase), False, user_aborting);
end if;
if X_Percent > 0 and then
((Bytes_in-1) mod X_Percent = 0
or Bytes_in = Integer(input_size))
then
if input_size_known then
PctDone := real_pct(phase) + Integer( (50.0 * Float( Bytes_in)) / Float(input_size));
feedback(PctDone, False, user_aborting);
else
feedback(real_pct(phase), False, user_aborting);
end if;
if user_aborting then
raise User_abort;
end if;
end if;
end if;
return b;
end Read_byte;
function More_bytes return Boolean is
begin
if InBufIdx > MaxInBufIdx then
Read_Block;
end if;
return not InputEoF;
end More_bytes;
upper_shift: constant Integer:= 2**(8-reduction_factor);
maximum_len_1: constant Integer:= upper_shift - 1;
maximum_len_1_b: constant Byte:= Byte(maximum_len_1);
-- LZ77 params
Look_redfac : constant array(1..4) of Integer:= (31,63,255,191);
-- See za_work.xls, sheet Reduce, for the cooking of these numbers...
Look_Ahead : constant Integer:= Look_redfac(reduction_factor);
String_buffer_size : constant := 2**12; -- 2**n optimizes "mod" to "and"
Threshold : constant := 3;
-- if the DLE coding doesn't fit the format constraints, we
-- need to decode it as a simple sequence of literals
-- before the probabilistic reduction
type Text_Buffer is array ( 0..String_buffer_size+Look_Ahead-1 ) of Byte;
Text_Buf: Text_Buffer;
R: Natural;
last_b: Symbol_range:= 0;
-- Raw byte: post LZ77 / DLE coding, pre probabilistic reduction
procedure Write_raw_byte( b: Byte ) is
curr_b: constant Symbol_range:= Symbol_range(b);
follo: Boolean;
begin
lz77_pos:= lz77_pos + 1;
case phase is
--
when stats =>
markov_d(last_b, curr_b):= markov_d(last_b, curr_b) + 1;
-- We also feed the cache which will be read at the 2nd phase:
LZ_cache.buf(LZ_cache.nxt):= b;
LZ_cache.nxt:= LZ_cache.nxt + 1;
LZ_cache.cnt:= Natural'Min(LZ_cache_size, LZ_cache.cnt + 1);
when compress => -- Probabilistic reduction
if Slen(last_b) = 0 then
-- follower set is empty for this character
Put_code(b, 8);
else
follo:= has_follower(last_b,curr_b);
Put_code(1-Boolean'Pos(follo), 1);
-- ^ Certainly a weakness of this format is that each byte is preceded by
-- a flag signaling "clear text" or compressed.
if follo then
Put_code(Byte(follower_pos(last_b,curr_b)), B_Table( Slen(last_b) ));
else
Put_code(b, 8);
end if;
end if;
end case;
last_b:= curr_b;
if phase = compress and then
using_LZ77 and then
(lz77_size - lz77_pos) < File_size_type(LZ_cache.cnt)
-- We have entered the zone covered by the cache, so no need
-- to continue the LZ77 compression effort: the results are
-- already stored.
then
raise Derail_LZ77;
-- We interrupt the LZ77 compression: data has been already
-- cached upon first pass (phase = stats), no need to redo it.
end if;
end Write_raw_byte;
-- The following procedures, Write_normal_byte and Write_DL_code,
-- are called by the LZ77 compressor
-- Write a normal, "clear-text", character
procedure Write_normal_byte( b: Byte ) is
begin
Write_raw_byte(b);
if b = DLE_code then
-- disambiguate situation where the character happens to have
-- the same 'Pos as the DLE code
Write_raw_byte(0);
end if;
Text_Buf(R):= b;
R:= (R+1) mod String_buffer_size;
end Write_normal_byte;
-- Write a Distance-Length code
procedure Write_DL_code( distance, length: Integer ) is
Copy_start: constant Natural:= (R - distance) mod String_buffer_size;
len: constant Integer:= length - 3;
dis: constant Integer:= distance - 1;
dis_upper: Byte;
begin
if distance in Distance_range and length in Length_range then
Write_raw_byte(DLE_code);
dis_upper:= Byte((dis / 256) * upper_shift);
-- Encode length and upper part of distance
if len < maximum_len_1 then
Write_raw_byte(Byte(len) + dis_upper);
else
Write_raw_byte(maximum_len_1_b + dis_upper);
Write_raw_byte(Byte(len - maximum_len_1));
end if;
-- Encode distance
Write_raw_byte(Byte(dis mod 256));
-- Expand in the circular text buffer to have it up to date
for K in 0..length-1 loop
Text_Buf(R):= Text_Buf((Copy_start+K) mod String_buffer_size);
R:= (R+1) mod String_buffer_size;
end loop;
else
-- Cannot encode this distance-length pair, then expand to output :-(
-- if phase= compress then Put("Aie! (" & distance'img & length'img & ")"); end if;
for K in 0..length-1 loop
Write_normal_byte( Text_Buf((Copy_start+K) mod String_buffer_size) );
end loop;
end if;
end Write_DL_code;
procedure My_LZ77 is
new LZ77.Encode
(String_buffer_size => String_buffer_size,
Look_Ahead => Look_Ahead,
Threshold => Threshold,
Method => LZ77.LZHuf,
-- NB: Method IZ_9 needs exactly the same set of LZ77 parameters as in
-- Deflate. Then the compression is worse, though much faster.
Read_byte => Read_byte,
More_bytes => More_bytes,
Write_literal => Write_normal_byte,
Write_DL_code => Write_DL_code);
procedure Finish_Cache is
i: LZ_buffer_range:= LZ_buffer_range(lz77_pos mod LZ_cache_size);
begin
while lz77_pos < lz77_size loop
Write_raw_byte(LZ_cache.buf(i));
i:= i + 1;
end loop;
end Finish_Cache;
begin -- Encode
Read_Block;
R:= String_buffer_size - Look_Ahead;
Bytes_in := 0;
if input_size_known then
X_Percent := Integer(input_size / 40);
else
X_Percent := 0;
end if;
using_LZ77:= True;
My_LZ77;
exception
when Derail_LZ77 => -- LZ77 compression interrupted because compressed data already cached
using_LZ77:= False;
Finish_Cache;
if feedback /= null then
feedback(100, False, user_aborting);
end if;
end Encode;
procedure Build_stats is new Encode(phase => stats);
procedure Compress is new Encode(phase => compress);
mem: ZS_Index_Type;
begin
-- Allocate input and output buffers.
if input_size_known then
InBuf:= new Byte_Buffer
(1..Integer'Min(Integer'Max(8,Integer(input_size)), buffer_size));
else
InBuf:= new Byte_Buffer(1..buffer_size);
end if;
OutBuf:= new Byte_Buffer(1..buffer_size);
OutBufIdx := 1;
output_size:= 0;
mem:= Index(input);
-- Pass 1: statistics to calibrate the probabilistic expansion
Build_stats;
Set_Index(input, mem); -- go back to beginning of message to compress
Build_Followers;
-- Pass 2: actual compression
Save_byte := 0; -- Initialize output bit buffer
Bits_used := 0;
Save_Followers; -- Emit the compression structure before the compressed message
lz77_size:= lz77_pos;
lz77_pos:= 0;
begin
Compress; -- Emit the compressed message
Flush_output;
compression_ok:= True;
exception
when Compression_inefficient =>
compression_ok:= False;
end;
Dispose(InBuf);
Dispose(OutBuf);
end Zip.Compress.Reduce;
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other related Ada projects
on Gautier's blog.