Back to... Zip-Ada

Source file : zip-compress.adb


with Zip.CRC_Crypto,
     Zip.Compress.Shrink,
     Zip.Compress.Reduce,
     Zip.Compress.Deflate,
     Zip.Compress.LZMA_E;

with Ada.Characters.Handling;           use Ada.Characters.Handling;
with Ada.Numerics.Discrete_Random;
with Ada.Strings.Fixed;                 use Ada.Strings.Fixed;

package body Zip.Compress is

  use Zip_Streams, Zip.CRC_Crypto;

  -------------------
  -- Compress_data --
  -------------------

  procedure Compress_data
   (input,
    output          : in out Zip_Streams.Root_Zipstream_Type'Class;
    input_size_known: Boolean;
    input_size      : File_size_type;
    method          : Compression_Method;
    feedback        : Feedback_proc;
    password        : String;
    content_hint    : Data_content_type;
    CRC             : out Interfaces.Unsigned_32;
    output_size     : out File_size_type;
    zip_type        : out Interfaces.Unsigned_16
   )
  is
    use Interfaces;
    counted: File_size_type;
    user_aborting: Boolean;
    idx_in:  constant ZS_Index_Type:= Index(input);
    idx_out: constant ZS_Index_Type:= Index(output);
    compression_ok: Boolean;
    first_feedback: Boolean:= True;
    --
    is_encrypted: constant Boolean:= password /= "";
    encrypt_pack, mem_encrypt_pack: Crypto_pack;
    encrypt_header: Byte_Buffer(1..12);
    package Byte_soup is new Ada.Numerics.Discrete_Random(Byte);
    use Byte_soup;
    cg: Byte_soup.Generator;
    --
    --  Store data as is, or, if do_write = False, just compute CRC (this is for encryption).
    --
    procedure Store_data(do_write: Boolean) is
      Buffer      : Byte_Buffer (1 .. buffer_size);
      Last_Read   : Natural;
    begin
      zip_type:= compression_format_code.store;
      counted:= 0;
      while not End_Of_Stream(input) loop
        if input_size_known and counted >= input_size then
          exit;
        end if;
        -- Copy data
        BlockRead (input, Buffer, Last_Read);
        counted:= counted + File_size_type (Last_Read);
        Update(CRC, Buffer (1 .. Last_Read));
        if do_write then
          Encode(encrypt_pack, Buffer (1 .. Last_Read));
          BlockWrite(output, Buffer (1 .. Last_Read));
        end if;
        -- Feedback
        if feedback /= null and then
          (first_feedback or (counted mod (2**16)=0) or
          (input_size_known and counted = input_size))
        then
          if input_size_known then
            feedback(
              percents_done =>
                Natural( (100.0 * Float(counted)) / Float(input_size) ),
              entry_skipped => False,
              user_abort    => user_aborting );
          else
            feedback(
              percents_done => 0,
              entry_skipped => False,
              user_abort    => user_aborting );
          end if;
          first_feedback:= False;
          if user_aborting then
            raise User_abort;
          end if;
        end if;
      end loop;
      output_size:= counted;
      compression_ok:= True;
    end Store_data;
    --
    procedure Compress_data_single_method(actual_method: Single_Method) is
    begin
      Init(CRC);
      if is_encrypted then
        Init_keys(encrypt_pack, password);
        Set_mode(encrypt_pack, encrypted);
        --  A bit dumb from Zip spec: we need to know the final CRC in order to set up
        --  the last byte of the encryption header, that allows for detecting if a password
        --  is OK - this, with 255/256 probability of correct detection of a wrong password!
        --  Result: 1st scan of the whole input stream with CRC calculation:
        Store_data(do_write => False);
        Reset(cg);
        for i in 1..11 loop
          encrypt_header(i):= Random(cg);
        end loop;
        encrypt_header(12):= Byte(Shift_Right( Final(CRC), 24 ));
        Set_Index(input, idx_in);
        Init(CRC);
        Encode(encrypt_pack, encrypt_header);
        BlockWrite(output, encrypt_header);
        --
        --  We need to remember at this point the encryption keys in case we need
        --  to rewrite from here (compression failed, store data).
        --
        mem_encrypt_pack:= encrypt_pack;
      else
        Set_mode(encrypt_pack, clear);
      end if;
      --
      --  Dispatch the work to child procedures doing the stream compression
      --  in different formats, depending on the actual compression method.
      --  For example, for methods LZMA_for_JPEG, LZMA_for_WAV, or LZMA_3, we
      --  logically call Zip.Compress.LZMA_E for the job.
      --
      case actual_method is
        --
        when Store =>
          Store_data(do_write => True);
        --
        when Shrink =>
          Zip.Compress.Shrink(
            input, output, input_size_known, input_size, feedback,
            CRC, encrypt_pack, output_size, compression_ok
          );
          zip_type:= compression_format_code.shrink;
        --
        when Reduction_Method =>
          Zip.Compress.Reduce(
            input, output, input_size_known, input_size, feedback,
            actual_method,
            CRC, encrypt_pack, output_size, compression_ok
          );
          zip_type:= compression_format_code.reduce +
            Unsigned_16(
              Compression_Method'Pos(actual_method) -
              Compression_Method'Pos(Reduce_1)
            );
        when Deflation_Method =>
          Zip.Compress.Deflate(
            input, output, input_size_known, input_size, feedback,
            actual_method,
            CRC, encrypt_pack, output_size, compression_ok
          );
          zip_type:= compression_format_code.deflate;
        when LZMA_Method =>
          Zip.Compress.LZMA_E(
            input, output, input_size_known, input_size, feedback,
            actual_method,
            CRC, encrypt_pack, output_size, compression_ok
          );
          zip_type:= compression_format_code.lzma;
      end case;
      CRC:= Final(CRC);
      --
      -- Handle case where compression has been unefficient:
      -- data to be compressed is too "random"; then compressed data
      -- happen to be larger than uncompressed data
      --
      if not compression_ok then
        -- Go back to the beginning and just store the data
        Set_Index(input, idx_in);
        if is_encrypted then
          Set_Index(output, idx_out + 12);
          --  Restore the encryption keys to their state just after the encryption header:
          encrypt_pack:= mem_encrypt_pack;
        else
          Set_Index(output, idx_out);
        end if;
        Init(CRC);
        Store_data(do_write => True);
        CRC:= Final(CRC);
      end if;
      if is_encrypted then
        output_size:= output_size + 12;
      end if;
    end Compress_data_single_method;

    fast_presel: constant Boolean:=
      method = Preselection_1 or (input_size_known and input_size < 22_805);

    data_type_to_LZMA_method: constant array(Data_content_type) of LZMA_Method:=
      (JPEG    => LZMA_for_JPEG,
       ARW_RW2 => LZMA_for_ARW,
       ORF_CR2 => LZMA_for_ORF,
       MP3     => LZMA_for_MP3,
       MP4     => LZMA_for_MP4,
       PGM     => LZMA_for_PGM,
       PPM     => LZMA_for_PPM,
       PNG     => LZMA_for_PNG,
       WAV     => LZMA_for_WAV,
       others  => LZMA_1  --  Fake, should be unused as such.
      );

  begin
    case method is
      --
      when Single_Method =>
        Compress_data_single_method(method);
      --
      when Preselection_Method =>
        case content_hint is
          when Neutral =>  --  No clue about what kind of data
            if input_size_known and input_size < 9_000 then
              Compress_data_single_method(Deflation_Method'Last);  --  Deflate
            elsif fast_presel then
              --  See: Optimum, LZ77 sheet in za_work.xls
              --       or l2_vs_l3.xls with a larger data set.
              Compress_data_single_method(LZMA_2);                 --  LZMA with IZ_10 match finder
            else
              Compress_data_single_method(LZMA_3);                 --  LZMA with BT4 match finder
            end if;
          when ARW_RW2 | ORF_CR2 | MP3 | MP4 | JPEG | PGM | PPM | PNG | WAV =>
            if input_size_known and input_size < 2_250 then
              Compress_data_single_method(Deflation_Method'Last);  --  Deflate
            else
              Compress_data_single_method(data_type_to_LZMA_method(content_hint));
            end if;
          when GIF =>
            if input_size_known and input_size < 350 then
              Compress_data_single_method(Deflate_1);
            else
              Compress_data_single_method(LZMA_for_GIF);
            end if;
          when Zip_in_Zip =>
            if input_size_known and input_size < 1_000 then
              Compress_data_single_method(Deflation_Method'Last);  --  Deflate
            elsif fast_presel then
              Compress_data_single_method(LZMA_2_for_Zip_in_Zip);
            else
              Compress_data_single_method(LZMA_3_for_Zip_in_Zip);
            end if;
          when Source_code =>
            if input_size_known and input_size < 8_000 then
              Compress_data_single_method(Deflation_Method'Last);  --  Deflate
            elsif fast_presel then
              Compress_data_single_method(LZMA_2_for_Source);
            else
              Compress_data_single_method(LZMA_3_for_Source);
            end if;
        end case;
    end case;
  end Compress_data;

  function Guess_type_from_name(name: String) return Data_content_type is
    up: constant String:= To_Upper(name);
    ext_1: constant String:= Tail(up, 2);
    ext_2: constant String:= Tail(up, 3);
    ext_3: constant String:= Tail(up, 4);
    ext_4: constant String:= Tail(up, 5);
  begin
    if ext_3 = ".JPG" or else ext_4 = ".JPEG" then
      return JPEG;
    end if;
    if ext_3 = ".ADA" or else ext_3 = ".ADS" or else ext_3 = ".ADB"
      or else ext_1 = ".C" or else ext_1 = ".H"
      or else ext_3 = ".CPP" or else ext_3 = ".HPP"
      or else ext_3 = ".DEF" or else ext_3 = ".ASM"
      or else ext_4 = ".JAVA" or else ext_2 = ".CS"
      or else ext_3 = ".PAS" or else ext_3 = ".INC" or else ext_2 = ".PP" or else ext_3 = ".LPR"
      or else ext_3 = ".MAK" or else ext_2 = ".IN"
      or else ext_2 = ".SH" or else ext_3 = ".BAT" or else ext_3 = ".CMD"
      or else ext_3 = ".XML" or else ext_3 = ".XSL" or else ext_4 = ".SGML"
      or else ext_3 = ".HTM" or else ext_4 = ".HTML"
      or else ext_2 = ".JS" or else ext_3 = ".LSP"
      or else ext_3 = ".CSV" or else ext_3 = ".SQL"
    then
      return Source_code;
    end if;
    --  Zip archives happen to be zipped...
    if ext_4 = ".EPUB"  --  EPUB: e-book reader format
      or else ext_3 = ".JAR" or else ext_3 = ".ZIP"
      or else ext_3 = ".ODB" or else ext_3 = ".ODS" or else ext_3 = ".ODT"
      or else ext_3 = ".OTR" or else ext_3 = ".OTS" or else ext_3 = ".OTT"
      or else ext_3 = ".CRX" or else ext_3 = ".NTH"
      or else ext_4 = ".DOCX" or else ext_4 = ".PPTX" or else ext_4 = ".XLSX"
    then
      return Zip_in_Zip;
    end if;
    --  Some raw camera picture data
    if ext_3 = ".ORF"          --  Raw Olympus
      or else ext_3 = ".CR2"   --  Raw Canon
      or else ext_3 = ".RAF"   --  Raw Fujifilm
      or else ext_3 = ".SRW"   --  Raw Samsung
    then
      return ORF_CR2;
    end if;
    if ext_3 = ".ARW"          --  Raw Sony
      or else ext_3 = ".RW2"   --  Raw Panasonic
      or else ext_3 = ".NEF"   --  Raw Nikon
      or else ext_3 = ".DNG"   --  Raw Leica, Pentax
      or else ext_3 = ".X3F"   --  Raw Sigma
    then
      return ARW_RW2;
    end if;
    if ext_3 = ".PGM" then
      return PGM;
    end if;
    if ext_3 = ".PPM" then
      return PPM;
    end if;
    if ext_3 = ".MP3" then
      return MP3;
    end if;
    if ext_3 = ".MTS" or else ext_3 = ".MP4" or else ext_3 = ".M4A" or else ext_3 = ".M4P" then
      return MP4;
    end if;
    if ext_3 = ".PNG" then
      return PNG;
    end if;
    if ext_3 = ".GIF" then
      return GIF;
    end if;
    if ext_3 = ".WAV" then
      return WAV;
    end if;
    return Neutral;
  end Guess_type_from_name;

end Zip.Compress;

Zip-Ada: Ada library for zip archive files (.zip). Ada programming.
Some news about Zip-Ada and other related Ada projects on Gautier's blog.