Back to... Zip-Ada

Source file : data_segmentation.adb



   1  --  Data_Segmentation
   2  ---------------------
   3  --
   4  --  Pure Ada 2005+ code, 100% portable: OS-, CPU- and compiler- independent.
   5  --
   6  --  The Data_Segmentation package offers tools for splitting data into more
   7  --  homogeneous blocks, with the hope of compressing those blocks better.
   8  --
   9  --  Legal licensing note:
  10  --
  11  --  Copyright (c) 2025 Gautier de Montmollin
  12  --  SWITZERLAND
  13  --
  14  --  Permission is hereby granted, free of charge, to any person obtaining a copy
  15  --  of this software and associated documentation files (the "Software"), to deal
  16  --  in the Software without restriction, including without limitation the rights
  17  --  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  18  --  copies of the Software, and to permit persons to whom the Software is
  19  --  furnished to do so, subject to the following conditions:
  20  --
  21  --  The above copyright notice and this permission notice shall be included in
  22  --  all copies or substantial portions of the Software.
  23  --
  24  --  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  25  --  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  26  --  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  27  --  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  28  --  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  29  --  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  30  --  THE SOFTWARE.
  31  --
  32  --  NB: this is the MIT License, as found 21-Aug-2016 on the site
  33  --  http://www.opensource.org/licenses/mit-license.php
  34  
  35  with Ada.Numerics.Generic_Elementary_Functions;
  36  
  37  package body Data_Segmentation is
  38  
  39    procedure Segment_by_Entropy (buffer : in Buffer_Type; seg : out Segmentation) is
  40      --  Adapted from extras/entropy_segmentation.adb
  41      type Real is digits 15;
  42      package REF is new Ada.Numerics.Generic_Elementary_Functions (Real);
  43      use REF;
  44      inv_window_size : constant Real := 1.0 / Real (window_size);
  45      len : constant Index := Index (buffer'Length);
  46      seg_point : Index;
  47      freq : array (Alphabet) of Natural := (others => 0);
  48      elem : array (Alphabet) of Real    := (others => 0.0);
  49      function Entropy_Function (p : Real) return Real is (-p * Log (p));
  50      function Prob (b : Alphabet) return Real is (Real (freq (b)) * inv_window_size);
  51      entropy : Real := 0.0;
  52      entropy_mark : Real;
  53      index_mark : Index := 1;
  54      p : Real;
  55      bt : Alphabet;
  56    begin
  57      seg.Clear;
  58      if len > window_size + index_threshold then
  59        for i in 1 .. len loop
  60          --  Fill the sliding window.
  61          bt := buffer (i);
  62          freq (bt) := freq (bt) + 1;
  63          if i = window_size then
  64            --  Compute initial entropy value.
  65            for b in Alphabet loop
  66              p := Prob (b);
  67              if p > 0.0 then
  68                elem (b) := Entropy_Function (p);
  69                entropy := entropy + elem (b);
  70              end if;
  71            end loop;
  72            entropy_mark := entropy;
  73          elsif i > window_size then
  74            --  Adjust entropy for new value coming in.
  75            entropy := entropy - elem (bt);
  76            p := Prob (bt);
  77            --  Note: count (bt) is positive, px too.
  78            elem (bt) := Entropy_Function (p);
  79            entropy := entropy + elem (bt);
  80            --  Adjust entropy for old value disappearing.
  81            bt := buffer (i - window_size);
  82            entropy := entropy - elem (bt);
  83            freq (bt) := freq (bt) - 1;
  84            p := Prob (bt);
  85            if p > 0.0 then
  86              elem (bt) := Entropy_Function (p);
  87              entropy := entropy + elem (bt);
  88            else
  89              elem (bt) := 0.0;
  90            end if;
  91            if abs (entropy - entropy_mark) > Real (discrepancy_threshold) then
  92              seg_point := i - window_size;
  93              if seg_point - index_mark > index_threshold then
  94                seg.Append (seg_point);
  95                index_mark := seg_point;
  96                entropy_mark := entropy;
  97              end if;
  98            end if;
  99          end if;
 100        end loop;
 101      end if;
 102      if len > 0 then
 103        seg.Append (len);
 104      end if;
 105    end Segment_by_Entropy;
 106  
 107  end Data_Segmentation;

Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip). Ada programming.
Some news about Zip-Ada and other Ada projects on Gautier's blog.