Maybe I forgot to commit

This commit is contained in:
2023-08-08 12:28:44 +02:00
commit 8795ab8735
15 changed files with 1975 additions and 0 deletions

0
.cargo/config Normal file
View File

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
/target
.idea

3
.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "SP800-90B_EntropyAssessment"]
path = SP800-90B_EntropyAssessment
url = https://github.com/usnistgov/SP800-90B_EntropyAssessment

1077
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

16
Cargo.toml Normal file
View File

@@ -0,0 +1,16 @@
[package]
name = "SP800-90B-rs"
version = "0.1.0"
build = "build.rs"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
autocxx = "0.26.0"
cxx = "1.0"
rand = {version = "0.8", features=["min_const_gen"]}
[build-dependencies]
autocxx-build = "0.26.0"
miette = { version = "5", features = ["fancy"] } # optional but gives nicer error messages!

19
build.rs Normal file
View File

@@ -0,0 +1,19 @@
/// Build script: generates autocxx bindings for the C++ wrapper headers and
/// links against the NIST SP800-90B assessment library and its dependencies.
fn main() -> miette::Result<()> {
    // Include path for our C++ wrapper headers (data.hpp / nist.hpp).
    let path = std::path::PathBuf::from("src/cpp");
    // Let the linker find the pre-built NIST assessment objects.
    println!("cargo:rustc-link-search=SP800-90B_EntropyAssessment/cpp");
    // autocxx scans src/main.rs for the include_cpp! macro and generates
    // bindings from it — this assumes all C++ bindings are in main.rs.
    let mut b = autocxx_build::Builder::new("src/main.rs", &[&path]).build()?;
    b.flag_if_supported("-std=c++14")
        .compile("SP800-90B-rs"); // arbitrary library name, pick anything
    // Re-run this build script when the bindings source changes.
    // FIX: must point at the file autocxx actually parses (src/main.rs);
    // the previous "cpp/main.rs" path does not exist, so edits to the
    // bindings never triggered a rebuild.
    println!("cargo:rerun-if-changed=src/main.rs");
    // C++ libraries required by the NIST SP800-90B implementation.
    println!("cargo:rustc-link-lib=bz2");
    println!("cargo:rustc-link-lib=pthread");
    println!("cargo:rustc-link-lib=divsufsort");
    println!("cargo:rustc-link-lib=jsoncpp");
    println!("cargo:rustc-link-lib=mpfr");
    println!("cargo:rustc-link-lib=gmp");
    Ok(())
}

16
src/cpp/bindings.cpp Normal file
View File

@@ -0,0 +1,16 @@
#include "bindings.hpp"

#include <cstdint>
#include <cstdio>
// Smoke test for the A wrapper: store a value and echo it back on stdout.
// NOTE(review): uses printf but this file only includes <cstdint>; confirm
// <cstdio> is reached transitively via bindings.hpp, or include it here.
int main (int argc, char *argv[]) {
A a = A();
a.set(8);
printf("%d\n",a.get());
return 0;
}
// Stores `val` in the wrapped field `a` (declared in bindings.hpp).
void A::set(uint32_t val){
a = val;
}
// Returns the stored value; const accessor.
uint32_t A::get() const{
return a;
}

62
src/cpp/bindings.cpp.bk Normal file
View File

@@ -0,0 +1,62 @@
#include <pybind11/pybind11.h>
#include "data.hpp"
namespace py = pybind11;
using namespace py::literals;
// Python bindings for the Data wrapper (pybind11 module `sp800_90b`).
// Fixes relative to the original are confined to user-facing docstrings:
// "assesment"->"assessment", "consisty"->"consists", "LSR"->"LRS",
// "foa a conditioned seqauential"->"for a conditioned sequential",
// "bistring"/"bistringy"->"bitstring", and the transposed section number
// 3.6.1 -> 6.3.1 (matching the 6.3.x numbering used by every other entry).
PYBIND11_MODULE(sp800_90b, m) {
    m.doc() = "SP 800-90B entropy assessment, using the nist-provided implementation.";
    py::class_<Data>(m, "Data", "Samples that can be assessed for entropy.")
        .def(
            py::init<py::bytes, int, bool>(),
            "Constructs entropy assessment sample data.",
            "samples"_a, "bits_per_symbol"_a = 8, "truncate"_a = false
        )
        .def_readonly("is_binary", &Data::is_binary, "Whether the data only consists of two different symbols.")
        .def_readonly("rawmean", &Data::rawmean, "Statistical mean value of the data.")
        .def_readonly("median", &Data::median, "Statistical median value of the data.")
        .def("iid_tests", &Data::iid_tests, "Tests if the data is independently and identically distributed using `chi_square_test`, `lrs_tests` and `permutation_tests`.")
        .def("chi_square_tests", &Data::chi_square_tests, "Tests if the data is independently and identically distributed using Pearson's chi-squared test.")
        .def("lrs_tests", &Data::lrs_tests, "Tests if the data is independently and identically distributed using an LRS test.")
        .def("permutation_tests", &Data::permutation_tests, "Tests if the data is independently and identically distributed using permutations.")
        .def("h_initial", &Data::h_initial, "Computes the initial entropy estimate (Section 3.1.3).")
        .def("h_conditioned", &Data::h_conditioned, "Computes the entropy estimate for a conditioned sequential dataset (Section 3.1.5.2).")
        .def("h_both", &Data::h_both, "Computes both entropy estimates.")
        .def("h_min_all", &Data::h_min_all, "Computes the minimum of the entropy estimates for the original data.")
        .def("h_max", &Data::h_max, "The maximal entropy for the dataset (word_size).")
        .def("h_most_common", &Data::h_most_common, "Estimates the entropy with the most common value (Section 6.3.1).")
        .def("h_collision", &Data::h_collision, "Estimates the entropy with a collision test (Section 6.3.2).")
        .def("h_markov", &Data::h_markov, "Estimates the entropy with a markov test (Section 6.3.3).")
        .def("h_compression", &Data::h_compression, "Estimates the entropy with a compression test (Section 6.3.4).")
        .def("h_t_tuple_and_lrs", &Data::h_t_tuple_and_lrs, "Estimates the entropy with a t-tuple and a lrs test (Sections 6.3.5 and 6.3.6).")
        .def("h_multi_most_common", &Data::h_multi_most_common, "Estimates the entropy with a multi most common in window test (Section 6.3.7).")
        .def("h_lag_prediction", &Data::h_lag_prediction, "Estimates the entropy with a lag prediction test (Section 6.3.8).")
        .def("h_multi_markov", &Data::h_multi_markov, "Estimates the entropy with a multi markov model with counting test (Section 6.3.9).")
        .def("h_lz78y", &Data::h_lz78y, "Estimates the entropy with a LZ78Y test (Section 6.3.10).")
        .def("h_bitstring_min_all", &Data::h_bitstring_min_all, "Computes the minimum of the entropy estimates for the data in bitstring form.")
        .def("h_bitstring_max", &Data::h_bitstring_max, "The maximal entropy for the bitstring data (is 1.0).")
        .def("h_bitstring_most_common", &Data::h_bitstring_most_common, "Estimates the entropy of the data as a bitstring with the most common value (Section 6.3.1).")
        .def("h_bitstring_collision", &Data::h_bitstring_collision, "Estimates the entropy of the data as a bitstring with a collision test (Section 6.3.2).")
        .def("h_bitstring_markov", &Data::h_bitstring_markov, "Estimates the entropy of the data as a bitstring with a markov test (Section 6.3.3).")
        .def("h_bitstring_compression", &Data::h_bitstring_compression, "Estimates the entropy of the data as a bitstring with a compression test (Section 6.3.4).")
        .def("h_bitstring_t_tuple_and_lrs", &Data::h_bitstring_t_tuple_and_lrs, "Estimates the entropy of the data as a bitstring with a t-tuple and a lrs test (Sections 6.3.5 and 6.3.6).")
        .def("h_bitstring_multi_most_common", &Data::h_bitstring_multi_most_common, "Estimates the entropy of the data as a bitstring with a multi most common in window test (Section 6.3.7).")
        .def("h_bitstring_lag_prediction", &Data::h_bitstring_lag_prediction, "Estimates the entropy of the data as a bitstring with a lag prediction test (Section 6.3.8).")
        .def("h_bitstring_multi_markov", &Data::h_bitstring_multi_markov, "Estimates the entropy of the data as a bitstring with a multi markov model with counting test (Section 6.3.9).")
        .def("h_bitstring_lz78y", &Data::h_bitstring_lz78y, "Estimates the entropy of the data as a bitstring with a LZ78Y test (Section 6.3.10).")
        ;
}

18
src/cpp/bindings.hpp Normal file
View File

@@ -0,0 +1,18 @@
#pragma once
#include "data.hpp"
// #include <cstdint>
// #include <stdint.h>
// #include <string>
// struct A {
// A() {}
// void set(uint32_t val){
// a = val;
// };
// uint32_t get() const;
// uint32_t a;
// };

182
src/cpp/data.cpp.bk Normal file
View File

@@ -0,0 +1,182 @@
#include "data.hpp"
// Builds and sanity-checks a data_t from `length` raw sample bytes.
// Throws std::invalid_argument when the alphabet degenerates to a single
// symbol; optionally truncates the bitstring length to MIN_SIZE.
// (Fixed the "lenght" parameter-name typo; call sites are unaffected.)
data_t build_data(const char * samples, int length, int bits_per_symbol, bool truncate) {
    // constructing data
    data_t data = construct_data_t(samples, length, bits_per_symbol);
    // A one-symbol alphabet carries no entropy at all — reject it.
    if (data.alph_size <= 1)
        throw std::invalid_argument("Symbol alphabet consists of 1 symbol. No entropy awarded...");
    // truncate the bitstring view to the NIST minimum sample size
    if (truncate && (data.blen > MIN_SIZE))
        data.blen = MIN_SIZE;
    // warn (but proceed) when fewer than the recommended number of samples
    if (data.len < MIN_SIZE)
        printf("\n*** Warning: data contains less than %d (min size) samples ***\n\n", MIN_SIZE);
    return data;
}
// Delegating constructor: builds the underlying data_t from raw samples,
// then runs the private Data(data_t) constructor which computes the stats.
Data::Data(const char * samples,int length, int bits_per_symbol, bool truncate)
: Data(build_data(samples, length, bits_per_symbol, truncate)) {}
// IID verdict: all three NIST test families must pass. Short-circuits,
// so the expensive permutation tests only run when the cheaper ones pass.
bool Data::iid_tests() const {
return
chi_square_tests()
&& lrs_tests()
&& permutation_tests();
}
// Pearson chi-square tests on the literal symbols (verbosity 0).
// `::` disambiguates the free NIST function from this member.
bool Data::chi_square_tests() const {
return ::chi_square_tests(this->data.symbols, this->data.len, this->data.alph_size, 0);
}
// Longest-repeated-substring length test on the literal symbols.
bool Data::lrs_tests() const {
return len_LRS_test(this->data.symbols, this->data.len, this->data.alph_size, 0, "Literal");
}
// NIST permutation (shuffle) tests, reusing the precomputed mean/median.
bool Data::permutation_tests() const {
return ::permutation_tests(&this->data, this->rawmean, this->median, 0);
}
// Upper bound on per-symbol entropy: the symbol width in bits.
double Data::h_max() const {
return data.word_size;
}
// Upper bound for the bitstring view: one bit per position.
double Data::h_bitstring_max() const {
return 1.0;
}
// Most-common-value estimate on the literal symbols.
double Data::h_most_common() const {
return most_common(data.symbols, data.len, data.alph_size, 0, "Literal");
}
// Most-common-value estimate on the bitstring expansion (alphabet = 2).
double Data::h_bitstring_most_common() const {
return most_common(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
// Multi most-common-in-window prediction estimate.
double Data::h_multi_most_common() const {
return multi_mcw_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double Data::h_bitstring_multi_most_common() const {
return multi_mcw_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
// Collision estimate (callers gate the literal form on is_binary).
double Data::h_collision() const {
return collision_test(data.symbols, data.len, 0, "Literal");
}
double Data::h_bitstring_collision() const {
return collision_test(data.bsymbols, data.blen, 0, "Bitstring");
}
// Markov estimate (callers gate the literal form on is_binary).
double Data::h_markov() const {
return markov_test(data.symbols, data.len, 0, "Literal");
}
double Data::h_bitstring_markov() const {
return markov_test(data.bsymbols, data.blen, 0, "Bitstring");
}
// Multi Markov-model-with-counting prediction estimate.
double Data::h_multi_markov() const {
return multi_mmc_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double Data::h_bitstring_multi_markov() const {
return multi_mmc_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
// Compression estimate (callers gate the literal form on is_binary).
double Data::h_compression() const {
return compression_test(data.symbols, data.len, 0, "Literal");
}
double Data::h_bitstring_compression() const {
return compression_test(data.bsymbols, data.blen, 0, "Bitstring");
}
// t-tuple and LRS estimates via suffix-array algorithms. The -1.0
// sentinels mean "not computed"; callers must check for >= 0.
std::pair<double, double> Data::h_t_tuple_and_lrs() const {
double t_tuple_res = -1.0, lrs_res = -1.0;
SAalgs(data.symbols, data.len, data.alph_size, t_tuple_res, lrs_res, 0, "Literal");
return std::make_pair(t_tuple_res, lrs_res);
}
std::pair<double, double> Data::h_bitstring_t_tuple_and_lrs() const {
double t_tuple_res = -1.0, lrs_res = -1.0;
SAalgs(data.bsymbols, data.blen, 2, t_tuple_res, lrs_res, 0, "Bitstring");
return std::make_pair(t_tuple_res, lrs_res);
}
// Lag prediction estimate.
double Data::h_lag_prediction() const {
return lag_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double Data::h_bitstring_lag_prediction() const {
return lag_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
// LZ78Y prediction estimate.
double Data::h_lz78y() const {
return LZ78Y_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double Data::h_bitstring_lz78y() const {
return LZ78Y_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
// Minimum over all applicable literal-symbol estimators. Collision,
// Markov and compression only run on binary alphabets; t-tuple/LRS
// results are skipped when negative (sentinel for "not computed").
double Data::h_min_all() const {
double h = h_max();
h = min(h, h_most_common());
if (is_binary)
h = min(h, h_collision());
if (is_binary)
h = min(h, h_markov());
if (is_binary)
h = min(h, h_compression());
auto tup_and_lrs = h_t_tuple_and_lrs();
if (tup_and_lrs.first >= 0.0)
h = min(h, tup_and_lrs.first);
if (tup_and_lrs.second >= 0.0)
h = min(h, tup_and_lrs.second);
h = min(h, h_multi_most_common());
h = min(h, h_lag_prediction());
h = min(h, h_multi_markov());
h = min(h, h_lz78y());
return h;
}
// Minimum over all estimators applied to the bitstring expansion.
double Data::h_bitstring_min_all() const {
double h = h_bitstring_max();
h = min(h, h_bitstring_most_common());
h = min(h, h_bitstring_collision());
h = min(h, h_bitstring_markov());
h = min(h, h_bitstring_compression());
auto tup_and_lrs = h_bitstring_t_tuple_and_lrs();
if (tup_and_lrs.first >= 0.0)
h = min(h, tup_and_lrs.first);
if (tup_and_lrs.second >= 0.0)
h = min(h, tup_and_lrs.second);
h = min(h, h_bitstring_multi_most_common());
h = min(h, h_bitstring_lag_prediction());
h = min(h, h_bitstring_multi_markov());
h = min(h, h_bitstring_lz78y());
return h;
}
// Initial entropy estimate; for non-binary data also bounded by the
// conditioned (bitstring-based) estimate.
double Data::h_initial() const {
if (is_binary) return h_min_all();
return min(h_min_all(), h_conditioned());
}
// Conditioned estimate: scale the per-bit estimate by the symbol width.
double Data::h_conditioned() const {
return data.word_size * h_bitstring_min_all();
}
// Returns (initial, conditioned), computing the conditioned estimate once.
std::pair<double, double> Data::h_both() const {
double h_cond = h_conditioned();
return std::make_pair(min(h_min_all(), h_cond), h_cond);
}

218
src/cpp/data.hpp Normal file
View File

@@ -0,0 +1,218 @@
#ifndef DATA_HPP
#define DATA_HPP
#include <utility>
#include "nist.hpp"
// Builds and sanity-checks a data_t from `length` raw sample bytes.
// Throws std::invalid_argument when the alphabet degenerates to a single
// symbol; optionally truncates the bitstring length to MIN_SIZE.
// Declared `inline` because this definition lives in a header: a
// non-inline definition would violate the one-definition rule as soon as
// the header is included from a second translation unit.
// (Also fixes the "lenght" parameter-name typo; call sites unaffected.)
inline data_t build_data(const char *samples, int length, int bits_per_symbol,
                         bool truncate) {
  // constructing data
  data_t data = construct_data_t(samples, length, bits_per_symbol);
  // A one-symbol alphabet carries no entropy at all — reject it.
  if (data.alph_size <= 1)
    throw std::invalid_argument(
        "Symbol alphabet consists of 1 symbol. No entropy awarded...");
  // truncate the bitstring view to the NIST minimum sample size
  if (truncate && (data.blen > MIN_SIZE))
    data.blen = MIN_SIZE;
  // warn (but proceed) when fewer than the recommended number of samples
  if (data.len < MIN_SIZE)
    printf(
        "\n*** Warning: data contains less than %d (min size) samples ***\n\n",
        MIN_SIZE);
  return data;
}
class Data {
private:
data_t data;
public:
const double rawmean;
const double median;
const bool is_binary;
private:
Data(data_t data, Stats stats)
: data(data), rawmean(stats.rawmean), median(stats.median),
is_binary(data.alph_size == 2) {}
Data(data_t data) : Data(data, Stats(data)) {}
public:
Data(const char *samples, int length, int bits_per_symbol = 8,
bool truncate = false)
: Data(build_data(samples, length, bits_per_symbol, truncate)){};
~Data() { free_data(&data); }
// distribution tests
bool iid_tests() const {
IidTestCase tc;
tc.mean = rawmean;
tc.median = median;
tc.binary = is_binary;
bool cst = chi_square_tests_();
return
chi_square_tests_()
&& lrs_tests_()
&& permutation_tests_();
}
bool chi_square_tests_() const {
return chi_square_tests(this->data.symbols, this->data.len, this->data.alph_size, 0);
}
bool lrs_tests_() const {
return len_LRS_test(this->data.symbols, this->data.len, this->data.alph_size, 0, "Literal");
}
bool permutation_tests_() const {
return permutation_tests(&this->data, this->rawmean, this->median, 0);
}
// entropy estimates
double h_max() const {
return data.word_size;
}
double h_bitstring_max() const {
return 1.0;
}
double h_most_common() const {
return most_common(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double h_bitstring_most_common() const {
return most_common(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
double h_multi_most_common() const {
return multi_mcw_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double h_bitstring_multi_most_common() const {
return multi_mcw_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
double h_collision() const {
return collision_test(data.symbols, data.len, 0, "Literal");
}
double h_bitstring_collision() const {
return collision_test(data.bsymbols, data.blen, 0, "Bitstring");
}
double h_markov() const {
return markov_test(data.symbols, data.len, 0, "Literal");
}
double h_bitstring_markov() const {
return markov_test(data.bsymbols, data.blen, 0, "Bitstring");
}
double h_multi_markov() const {
return multi_mmc_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double h_bitstring_multi_markov() const {
return multi_mmc_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
double h_compression() const {
return compression_test(data.symbols, data.len, 0, "Literal");
}
double h_bitstring_compression() const {
return compression_test(data.bsymbols, data.blen, 0, "Bitstring");
}
std::pair<double, double> h_t_tuple_and_lrs() const {
double t_tuple_res = -1.0, lrs_res = -1.0;
SAalgs(data.symbols, data.len, data.alph_size, t_tuple_res, lrs_res, 0, "Literal");
return std::make_pair(t_tuple_res, lrs_res);
}
std::pair<double, double> h_bitstring_t_tuple_and_lrs() const {
double t_tuple_res = -1.0, lrs_res = -1.0;
SAalgs(data.bsymbols, data.blen, 2, t_tuple_res, lrs_res, 0, "Bitstring");
return std::make_pair(t_tuple_res, lrs_res);
}
double h_lag_prediction() const {
return lag_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double h_bitstring_lag_prediction() const {
return lag_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
double h_lz78y() const {
return LZ78Y_test(data.symbols, data.len, data.alph_size, 0, "Literal");
}
double h_bitstring_lz78y() const {
return LZ78Y_test(data.bsymbols, data.blen, 2, 0, "Bitstring");
}
double h_min_all() const {
double h = h_max();
h = min(h, h_most_common());
if (is_binary)
h = min(h, h_collision());
if (is_binary)
h = min(h, h_markov());
if (is_binary)
h = min(h, h_compression());
auto tup_and_lrs = h_t_tuple_and_lrs();
if (tup_and_lrs.first >= 0.0)
h = min(h, tup_and_lrs.first);
if (tup_and_lrs.second >= 0.0)
h = min(h, tup_and_lrs.second);
h = min(h, h_multi_most_common());
h = min(h, h_lag_prediction());
h = min(h, h_multi_markov());
h = min(h, h_lz78y());
return h;
}
double h_bitstring_min_all() const {
double h = h_bitstring_max();
h = min(h, h_bitstring_most_common());
h = min(h, h_bitstring_collision());
h = min(h, h_bitstring_markov());
h = min(h, h_bitstring_compression());
auto tup_and_lrs = h_bitstring_t_tuple_and_lrs();
if (tup_and_lrs.first >= 0.0)
h = min(h, tup_and_lrs.first);
if (tup_and_lrs.second >= 0.0)
h = min(h, tup_and_lrs.second);
h = min(h, h_bitstring_multi_most_common());
h = min(h, h_bitstring_lag_prediction());
h = min(h, h_bitstring_multi_markov());
h = min(h, h_bitstring_lz78y());
return h;
}
double h_initial() const {
if (is_binary) return h_min_all();
return min(h_min_all(), h_conditioned());
}
double h_conditioned() const {
return data.word_size * h_bitstring_min_all();
}
std::pair<double, double> h_both() const {
double h_cond = h_conditioned();
return std::make_pair(min(h_min_all(), h_cond), h_cond);
}
};
#endif

127
src/cpp/nist.cpp.bk Normal file
View File

@@ -0,0 +1,127 @@
#include <shared/utils.h>
#define NO_NIST_DEFS
#include "nist.hpp"
// Computes the raw mean and median of the samples via the NIST helper,
// storing them in this Stats instance.
Stats::Stats(data_t &data) {
calc_stats(&data, this->rawmean, this->median);
}
#include <shared/most_common.h>
#include <shared/lrs_test.h>
#include <iid/permutation_tests.h>
#include <iid/chi_square_tests.h>
#include <non_iid/collision_test.h>
#include <non_iid/lz78y_test.h>
#include <non_iid/multi_mmc_test.h>
#include <non_iid/lag_test.h>
#include <non_iid/multi_mcw_test.h>
#include <non_iid/compression_test.h>
#include <non_iid/markov_test.h>
#include <omp.h>
#include <limits.h>
// Copies `symbols_len` sample bytes into a freshly allocated data_t,
// derives the effective alphabet, and builds the bit-per-position
// expansion (bsymbols). Throws std::invalid_argument on empty input or a
// too-narrow declared bit width, std::runtime_error on allocation failure.
// Fixes vs. original:
//  * data_t is zero-initialized, so free_data() on an early failure path
//    no longer touches uninitialized pointers (was undefined behavior);
//  * the bsymbols failure path frees before throwing — the original threw
//    first, leaving the free_data() call unreachable and leaking buffers.
data_t construct_data_t(
    const char *symbols,
    unsigned long symbols_len,
    int bits_per_word
) {
    if (symbols_len == 0)
        throw std::invalid_argument("Need more than zero symbols!");
    data_t data = {}; // zero-init: safe to free_data() at any point below
    ////////////////////////////////////////////////////
    // init symbols
    data.len = symbols_len;
    data.symbols = (byte*) malloc(data.len);
    if (data.symbols == NULL) {
        free_data(&data);
        throw std::runtime_error("Failure to initialize memory for symbols!");
    }
    memcpy(data.symbols, symbols, data.len);
    ////////////////////////////////////////////////////
    // init wordsize
    data.word_size = bits_per_word;
    // OR all samples together; the highest set bit tells us how wide the
    // data actually is, to validate the declared bits_per_word.
    byte datamask = 0;
    byte curbit = 0x80;
    for(int i = 0; i < data.len; i++) {
        datamask = datamask | data.symbols[i];
    }
    int calculated_wordsize;
    for(calculated_wordsize=8; (calculated_wordsize>0) && ((datamask & curbit) == 0); calculated_wordsize--) {
        curbit = curbit >> 1;
    }
    if( calculated_wordsize < data.word_size ) {
        printf("Warning: Symbols appear to be narrower than described.\n");
    } else if( calculated_wordsize > data.word_size ) {
        free_data(&data);
        throw std::invalid_argument("Incorrect bit width specification: Data does not fit within described bit width.\n");
    }
    ////////////////////////////////////////////////////
    // init rawsymbols (pristine copy, kept before any mapping below)
    data.rawsymbols = (byte*) malloc(data.len);
    if(data.rawsymbols == NULL){
        free_data(&data);
        throw std::runtime_error("Failure to initialize memory for rawsymbols!");
    }
    memcpy(data.rawsymbols, data.symbols, data.len);
    ////////////////////////////////////////////////////
    // init max symbols and create symbol map down table
    // NOTE(review): variable-length array is a compiler extension in C++;
    // max_symbols <= 256 here, so a fixed table would also work.
    data.maxsymbol = 0;
    int max_symbols = 1 << data.word_size;
    int symbol_map_down_table[max_symbols];
    // mark which symbol values occur, masked to the declared width
    data.alph_size = 0;
    memset(symbol_map_down_table, 0, max_symbols*sizeof(int));
    int mask = max_symbols-1;
    for(int i = 0; i < data.len; i++){
        data.symbols[i] &= mask;
        if(data.symbols[i] > data.maxsymbol) data.maxsymbol = data.symbols[i];
        if(symbol_map_down_table[data.symbols[i]] == 0) symbol_map_down_table[data.symbols[i]] = 1;
    }
    // assign each occurring symbol a dense index [0, alph_size)
    for(int i = 0; i < max_symbols; i++){
        if(symbol_map_down_table[i] != 0) symbol_map_down_table[i] = (byte)data.alph_size++;
    }
    ////////////////////////////////////////////////////
    // create bsymbols (bitstring) using the non-mapped data;
    // for 1-bit words bsymbols simply aliases symbols (no extra buffer)
    data.blen = data.len * data.word_size;
    if(data.word_size == 1) data.bsymbols = data.symbols;
    else{
        data.bsymbols = (byte*)malloc(data.blen);
        if(data.bsymbols == NULL){
            // free BEFORE throwing (original threw first: unreachable free)
            free_data(&data);
            throw std::runtime_error("failure to initialize memory for bsymbols!");
        }
        // MSB-first expansion of every symbol into word_size bits
        for(int i = 0; i < data.len; i++){
            for(int j = 0; j < data.word_size; j++){
                data.bsymbols[i*data.word_size+j] = (data.symbols[i] >> (data.word_size-1-j)) & 0x1;
            }
        }
    }
    ////////////////////////////////////////////////////
    // map down symbols if less than 2^bits_per_word unique symbols
    if(data.alph_size < data.maxsymbol + 1){
        for(int i = 0; i < data.len; i++) data.symbols[i] = (byte)symbol_map_down_table[data.symbols[i]];
    }
    return data;
}

196
src/cpp/nist.hpp Normal file
View File

@@ -0,0 +1,196 @@
#ifndef NIST_HPP
#define NIST_HPP
#include "../../SP800-90B_EntropyAssessment/cpp/shared/utils.h"
#define NO_NIST_DEFS
#ifndef NO_NIST_DEFS
#define MIN_SIZE 1000000
#define min(a, b) \
({ \
__typeof__(a) _a = (a); \
__typeof__(b) _b = (b); \
_a < _b ? _a : _b; \
})
typedef unsigned char byte;
typedef struct data_t data;
struct data_t {
int word_size; // bits per symbol
int alph_size; // symbol alphabet size
byte maxsymbol; // the largest symbol present in the raw data stream
byte *rawsymbols; // raw data words
byte *symbols; // data words
byte *bsymbols; // data words as binary string
long len; // number of words in data
long blen; // number of bits in data
};
#endif
typedef unsigned char byte;
typedef struct data_t data;
#include "../../SP800-90B_EntropyAssessment/cpp/shared/most_common.h"
#include "../../SP800-90B_EntropyAssessment/cpp/shared/lrs_test.h"
#include "../../SP800-90B_EntropyAssessment/cpp/iid/iid_test_case.h"
#include "../../SP800-90B_EntropyAssessment/cpp/iid/permutation_tests.h"
#include "../../SP800-90B_EntropyAssessment/cpp/iid/chi_square_tests.h"
#include "../../SP800-90B_EntropyAssessment/cpp/non_iid/collision_test.h"
#include "../../SP800-90B_EntropyAssessment/cpp/non_iid/lz78y_test.h"
#include "../../SP800-90B_EntropyAssessment/cpp/non_iid/multi_mmc_test.h"
#include "../../SP800-90B_EntropyAssessment/cpp/non_iid/lag_test.h"
#include "../../SP800-90B_EntropyAssessment/cpp/non_iid/multi_mcw_test.h"
#include "../../SP800-90B_EntropyAssessment/cpp/non_iid/compression_test.h"
#include "../../SP800-90B_EntropyAssessment/cpp/non_iid/markov_test.h"
#include <omp.h>
#include <limits.h>
// Holds the sample statistics the permutation tests need; computed once
// at construction via the NIST calc_stats helper.
class Stats {
public:
double rawmean, median; // mean of raw samples / median value
Stats(data_t &data) { calc_stats(&data, this->rawmean, this->median); };
};
// ---- prototypes for the NIST assessment routines (implemented in the
// ---- SP800-90B_EntropyAssessment submodule headers included above) ----
// Suffix-array based t-tuple and LRS estimates; results are written into
// t_tuple_res / lrs_res (negative values mean "not computed").
void SAalgs(const byte data[], const long len, const int alph_size,
double &t_tuple_res, double &lrs_res, const int verbose,
const char *label);
double most_common(byte *data, const long len, const int alph_size,
const int verbose, const char *label);
double multi_mcw_test(byte *data, const long len, const int alph_size,
const int verbose, const char *label);
double collision_test(byte *data, const long len, const int verbose,
const char *label);
double markov_test(byte *data, const long len, const int verbose,
const char *label);
double multi_mmc_test(byte *data, const long len, const int alph_size,
const int verbose, const char *label);
double compression_test(byte *data, const long len, const int verbose,
const char *label);
double lag_test(byte *data, const long len, const int alph_size,
const int verbose, const char *label);
double LZ78Y_test(byte *data, const long len, const int alph_size,
const int verbose, const char *label);
// Builds a data_t from raw sample bytes (defined below in this header).
data_t construct_data_t(const char *samples, int sample_length,
int bits_per_symbol);
// Releases the buffers owned by a data_t.
void free_data(data_t *data);
// IID test entry points.
bool len_LRS_test(const byte data[], const int length, const int k,
const int verbosity, const char *label);
bool chi_square_tests(const byte data[], const int length, int alph_size,
const int verbosity);
bool permutation_tests(const data_t *data, double rawmean, double median,
int verbosity);
// Copies `symbols_len` sample bytes into a freshly allocated data_t,
// derives the effective alphabet, and builds the bit-per-position
// expansion (bsymbols). Throws std::invalid_argument on empty/invalid
// input or a too-narrow declared bit width, std::runtime_error on
// allocation failure.
// Fixes vs. original:
//  * declared `inline` — this definition lives in a header, so a
//    non-inline definition breaks the one-definition rule as soon as the
//    header is included from two translation units;
//  * rejects non-positive symbols_len (signed parameter here);
//  * data_t is zero-initialized, so free_data() on an early failure path
//    no longer touches uninitialized pointers (was undefined behavior);
//  * the bsymbols failure path frees before throwing — the original threw
//    first, leaving the free_data() call unreachable and leaking buffers.
inline data_t construct_data_t(
    const char *symbols,
    int symbols_len,
    int bits_per_word
) {
    if (symbols_len <= 0)
        throw std::invalid_argument("Need more than zero symbols!");
    data_t data = {}; // zero-init: safe to free_data() at any point below
    ////////////////////////////////////////////////////
    // init symbols
    data.len = symbols_len;
    data.symbols = (byte*) malloc(data.len);
    if (data.symbols == NULL) {
        free_data(&data);
        throw std::runtime_error("Failure to initialize memory for symbols!");
    }
    memcpy(data.symbols, symbols, data.len);
    ////////////////////////////////////////////////////
    // init wordsize
    data.word_size = bits_per_word;
    // OR all samples together; the highest set bit tells us how wide the
    // data actually is, to validate the declared bits_per_word.
    byte datamask = 0;
    byte curbit = 0x80;
    for(int i = 0; i < data.len; i++) {
        datamask = datamask | data.symbols[i];
    }
    int calculated_wordsize;
    for(calculated_wordsize=8; (calculated_wordsize>0) && ((datamask & curbit) == 0); calculated_wordsize--) {
        curbit = curbit >> 1;
    }
    if( calculated_wordsize < data.word_size ) {
        printf("Warning: Symbols appear to be narrower than described.\n");
    } else if( calculated_wordsize > data.word_size ) {
        free_data(&data);
        throw std::invalid_argument("Incorrect bit width specification: Data does not fit within described bit width.\n");
    }
    ////////////////////////////////////////////////////
    // init rawsymbols (pristine copy, kept before any mapping below)
    data.rawsymbols = (byte*) malloc(data.len);
    if(data.rawsymbols == NULL){
        free_data(&data);
        throw std::runtime_error("Failure to initialize memory for rawsymbols!");
    }
    memcpy(data.rawsymbols, data.symbols, data.len);
    ////////////////////////////////////////////////////
    // init max symbols and create symbol map down table
    // NOTE(review): variable-length array is a compiler extension in C++;
    // max_symbols <= 256 here, so a fixed table would also work.
    data.maxsymbol = 0;
    int max_symbols = 1 << data.word_size;
    int symbol_map_down_table[max_symbols];
    // mark which symbol values occur, masked to the declared width
    data.alph_size = 0;
    memset(symbol_map_down_table, 0, max_symbols*sizeof(int));
    int mask = max_symbols-1;
    for(int i = 0; i < data.len; i++){
        data.symbols[i] &= mask;
        if(data.symbols[i] > data.maxsymbol) data.maxsymbol = data.symbols[i];
        if(symbol_map_down_table[data.symbols[i]] == 0) symbol_map_down_table[data.symbols[i]] = 1;
    }
    // assign each occurring symbol a dense index [0, alph_size)
    for(int i = 0; i < max_symbols; i++){
        if(symbol_map_down_table[i] != 0) symbol_map_down_table[i] = (byte)data.alph_size++;
    }
    ////////////////////////////////////////////////////
    // create bsymbols (bitstring) using the non-mapped data;
    // for 1-bit words bsymbols simply aliases symbols (no extra buffer)
    data.blen = data.len * data.word_size;
    if(data.word_size == 1) data.bsymbols = data.symbols;
    else{
        data.bsymbols = (byte*)malloc(data.blen);
        if(data.bsymbols == NULL){
            // free BEFORE throwing (original threw first: unreachable free)
            free_data(&data);
            throw std::runtime_error("failure to initialize memory for bsymbols!");
        }
        // MSB-first expansion of every symbol into word_size bits
        for(int i = 0; i < data.len; i++){
            for(int j = 0; j < data.word_size; j++){
                data.bsymbols[i*data.word_size+j] = (data.symbols[i] >> (data.word_size-1-j)) & 0x1;
            }
        }
    }
    ////////////////////////////////////////////////////
    // map down symbols if less than 2^bits_per_word unique symbols
    if(data.alph_size < data.maxsymbol + 1){
        for(int i = 0; i < data.len; i++) data.symbols[i] = (byte)symbol_map_down_table[data.symbols[i]];
    }
    return data;
}
#endif

38
src/main.rs Normal file
View File

@@ -0,0 +1,38 @@
use autocxx::prelude::*;
use rand::Rng;
// autocxx bridge: parses data.hpp and generates Rust bindings under `ffi`
// for the C++ `Data` class. The commented lines record earlier binding
// attempts kept for reference.
include_cpp! {
//#include "shared/utils.h"
// #include "iid/chi_square_tests.h"
// #include "../../SP800-90B_EntropyAssessment/cpp/shared/utils.h"
// #include "nist.hpp"
#include "data.hpp"
// #include "bindings.hpp"
// expose generated APIs without per-call `unsafe` wrappers
safety!(unsafe)
// generate!("chi_square_tests")
// generate!("construct_data_t")
// generate!("data_t")
generate!("Data")
// generate!("A")
}
fn main() {
println!("Hello, world!");
let mut rng = rand::thread_rng();
let samples: [i8; 1_000_000] = rng.gen();
let samples_length = samples.len() as i32;
let bits_per_symbol = 8;
let data = unsafe {
ffi::Data::new2(
samples.as_ptr(),
samples_length.into(),
bits_per_symbol.into(),
false,
)
.within_unique_ptr()
};
let a = data.iid_tests();
}