Source file : data_segmentation.adb
1 -- Data_Segmentation
2 ---------------------
3 --
4 -- Pure Ada 2005+ code, 100% portable: OS-, CPU- and compiler- independent.
5 --
6 -- The Data_Segmentation package offers tools for splitting data into more
7 -- homogeneous blocks, with the hope of compressing those blocks better.
8 --
9 -- Legal licensing note:
10 --
11 -- Copyright (c) 2025 Gautier de Montmollin
12 -- SWITZERLAND
13 --
14 -- Permission is hereby granted, free of charge, to any person obtaining a copy
15 -- of this software and associated documentation files (the "Software"), to deal
16 -- in the Software without restriction, including without limitation the rights
17 -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18 -- copies of the Software, and to permit persons to whom the Software is
19 -- furnished to do so, subject to the following conditions:
20 --
21 -- The above copyright notice and this permission notice shall be included in
22 -- all copies or substantial portions of the Software.
23 --
24 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
30 -- THE SOFTWARE.
31 --
32 -- NB: this is the MIT License, as found 21-Aug-2016 on the site
33 -- http://www.opensource.org/licenses/mit-license.php
34
35 with Ada.Numerics.Generic_Elementary_Functions;
36
37 package body Data_Segmentation is
38
39 procedure Segment_by_Entropy (buffer : in Buffer_Type; seg : out Segmentation) is
40 -- Adapted from extras/entropy_segmentation.adb
41 type Real is digits 15;
42 package REF is new Ada.Numerics.Generic_Elementary_Functions (Real);
43 use REF;
44 inv_window_size : constant Real := 1.0 / Real (window_size);
45 len : constant Index := Index (buffer'Length);
46 seg_point : Index;
47 freq : array (Alphabet) of Natural := (others => 0);
48 elem : array (Alphabet) of Real := (others => 0.0);
49 function Entropy_Function (p : Real) return Real is (-p * Log (p));
50 function Prob (b : Alphabet) return Real is (Real (freq (b)) * inv_window_size);
51 entropy : Real := 0.0;
52 entropy_mark : Real;
53 index_mark : Index := 1;
54 p : Real;
55 bt : Alphabet;
56 begin
57 seg.Clear;
58 if len > window_size + index_threshold then
59 for i in 1 .. len loop
60 -- Fill the sliding window.
61 bt := buffer (i);
62 freq (bt) := freq (bt) + 1;
63 if i = window_size then
64 -- Compute initial entropy value.
65 for b in Alphabet loop
66 p := Prob (b);
67 if p > 0.0 then
68 elem (b) := Entropy_Function (p);
69 entropy := entropy + elem (b);
70 end if;
71 end loop;
72 entropy_mark := entropy;
73 elsif i > window_size then
74 -- Adjust entropy for new value coming in.
75 entropy := entropy - elem (bt);
76 p := Prob (bt);
77 -- Note: count (bt) is positive, px too.
78 elem (bt) := Entropy_Function (p);
79 entropy := entropy + elem (bt);
80 -- Adjust entropy for old value disappearing.
81 bt := buffer (i - window_size);
82 entropy := entropy - elem (bt);
83 freq (bt) := freq (bt) - 1;
84 p := Prob (bt);
85 if p > 0.0 then
86 elem (bt) := Entropy_Function (p);
87 entropy := entropy + elem (bt);
88 else
89 elem (bt) := 0.0;
90 end if;
91 if abs (entropy - entropy_mark) > Real (discrepancy_threshold) then
92 seg_point := i - window_size;
93 if seg_point - index_mark > index_threshold then
94 seg.Append (seg_point);
95 index_mark := seg_point;
96 entropy_mark := entropy;
97 end if;
98 end if;
99 end if;
100 end loop;
101 end if;
102 if len > 0 then
103 seg.Append (len);
104 end if;
105 end Segment_by_Entropy;
106
107 end Data_Segmentation;
Web view of Ada source code generated by GNATHTML, project: ALI_Parse version 1.0.
Zip-Ada: Ada library for zip archive files (.zip).
Ada programming.
Some news about Zip-Ada and other Ada projects
on Gautier's blog.