3 Коммиты 7e4caafab3 ... a2004b93a0

Автор SHA1 Сообщение Дата
  namark a2004b93a0 huffman 6 месяцев назад
  namark f2613edc47 Fixed bit output iterator flushing logic, 6 месяцев назад
  namark 9d9a0d9b90 Removed pointless debug log. 6 месяцев назад

+ 66 - 5
source/simple/compress/bits.hpp

@@ -1,12 +1,12 @@
 #ifndef SIMPLE_COMPRESS_BITS_HPP
 #define SIMPLE_COMPRESS_BITS_HPP
 
-#include <type_traits> // std::enable_if_t std::is_integral_v std::is_same_v std::make_unsigned_t std::underlying_type_t
+#include <type_traits> // std::enable_if_t std::is_integral_v std::is_same_v std::make_unsigned_t std::underlying_type_t std::is_unsigned_v
 #include <cstddef> // std::size_t
 #include <limits> // std::numeric_limits
 #include <iterator> // std::iterator_traits
 #include <algorithm> // std::min
-#include <utility> // std::forward
+#include <string> // std::string
 
 #include "simple/support/type_traits.hpp" // support::remove_cvref_t
 
@@ -57,10 +57,10 @@ namespace simple::compress
 	{
 		auto count = bit_count(to);
 		auto type_offset = bit_offset(to);
-		auto bits = get_bits(to);
+		auto bits = decltype(get_bits(to)){};
 
 		constexpr auto type_width = std::numeric_limits<decltype(bits)>::digits;
-		constexpr auto read_width = std::numeric_limits<std::underlying_type_t<typename std::iterator_traits<simple::support::remove_cvref_t<It>>::value_type>>::digits;
+		constexpr auto read_width = std::numeric_limits<std::underlying_type_t<typename std::iterator_traits<support::remove_cvref_t<It>>::value_type>>::digits;
 
 		if constexpr (type_width >= read_width)
 		{
@@ -93,7 +93,7 @@ namespace simple::compress
 			static_assert(type_width < read_width, "not implemented");
 		}
 
-		to = static_cast<T>(bits);
+		to = bits;
 		return it;
 	}
 
@@ -104,6 +104,67 @@ namespace simple::compress
 		return read_bits(bit_iterator{it,0}, to);
 	}
 
+	// A short run of bits packed into an unsigned integer.
+	// The meaningful bits sit at the most significant end of `value`;
+	// `count` tells how many of them there are.
+	template <typename T = unsigned, std::enable_if_t<std::is_unsigned_v<T>>* = nullptr>
+	struct bits
+	{
+		T value{};
+		std::size_t count{};
+
+		using value_type = T;
+		static constexpr std::size_t capacity = std::numeric_limits<value_type>::digits;
+
+		// Places `bit` in the most significant position, pushing the bits that
+		// are already stored one position towards the least significant end.
+		// Returns false and changes nothing when all `capacity` bits are in use.
+		constexpr bool insert(bool bit) noexcept
+		{
+			if(count == capacity)
+				return false;
+
+			const value_type top_bit = value_type{bit} << (capacity - 1);
+			value = (value >> 1) | top_bit;
+			++count;
+			return true;
+		}
+
+		// Replaces the stored bits with `x`, the count is left as it is.
+		constexpr bits& operator=(const T& x)
+		{
+			// TODO: assert x has no junk outside of count
+			value = x;
+			return *this;
+		}
+
+		// Two bit runs are equal when both the bits and their number match.
+		constexpr bool operator==(const bits& other) const
+		{
+			return count == other.count && value == other.value;
+		}
+
+		constexpr bool operator!=(const bits& other) const
+		{
+			return !(*this == other);
+		}
+	};
+
+	// Customization point used by the bit reading code: the stored bit pattern of `b`.
+	template <typename T>
+	constexpr auto get_bits(const bits<T>& b) noexcept
+	{
+		return b.value;
+	}
+
+	// Customization point used by the bit reading code: how many bits of `b` are in use.
+	template <typename T>
+	constexpr auto bit_count(const bits<T>& b) noexcept
+	{
+		return b.count;
+	}
+
+	// Customization point used by the bit reading code: a `bits` value always
+	// starts at its most significant bit, so the offset is zero.
+	template <typename T>
+	constexpr std::size_t bit_offset(const bits<T>&) noexcept
+	{
+		return 0;
+	}
+
+	// Renders the `bit_count(b)` meaningful bits of `b` as '0'/'1' characters,
+	// starting from the most significant bit.
+	// Deliberately not noexcept: growing the string can throw std::bad_alloc,
+	// and a noexcept here would turn that into std::terminate.
+	template <typename T>
+	std::string to_string(const bits<T>& b)
+	{
+		using value_type = decltype(get_bits(b));
+		constexpr value_type first_bit = value_type{1} << (std::numeric_limits<value_type>::digits - 1);
+
+		const auto count = bit_count(b);
+		auto remaining = get_bits(b);
+
+		std::string str{};
+		str.reserve(count);
+		for(auto left = count; left != 0; --left)
+		{
+			str += (remaining & first_bit) ? '1' : '0';
+			// bring the next bit to the front
+			remaining <<= 1;
+		}
+		return str;
+	}
 
 } // namespace simple::compress
 

+ 2 - 2
source/simple/compress/hash_table.hpp

@@ -9,12 +9,12 @@
 namespace simple::compress
 {
 
-	// why make own map? cause std devour all memory
+	// why make our own map? because the std ones devour all memory and never give it back
 	template <typename Key, typename Value, std::size_t row_count>
 	class hash_table
 	{
 
-		// TODO; use ecs to make these contiguous, the buckets often end up very small
+		// TODO: make these contiguous, the buckets often end up very small
 		std::vector<std::vector<Value>> table{row_count};
 		std::size_t size_{0};
 

+ 261 - 0
source/simple/compress/huffman.hpp

@@ -0,0 +1,261 @@
+#ifndef SIMPLE_COMPRESS_HUFFMAN_HPP
+#define SIMPLE_COMPRESS_HUFFMAN_HPP
+
+#include <iterator> // std::iterator_traits
+#include <cstddef> // std::size_t
+#include <array> // std::array
+#include <vector> // std::vector
+#include <utility> // std::pair std::move
+#include <limits> // std::numeric_limits
+#include <type_traits> // std::enable_if_t std::underlying_type_t std::make_unsigned_t std::is_unsigned_v std::conditional_t std::is_integral_v
+#include <cassert> // assert
+#include <algorithm> // std::transform std::swap
+
+#include "simple/support/type_traits.hpp" // support::is_template_instance_v
+
+#include "hash_table.hpp" // hash_table
+#include "bits.hpp" // bit_count bit_offset get_bits bits read_bits
+
+namespace simple::compress
+{
+
+	// oof, so painful we still don't have this in std, especially with all the junk that went through over the years
+	// A vector with a fixed, compile time capacity and no dynamic allocation.
+	// All `Capacity` elements are value-initialized up front, push_back/pop_back
+	// only move the logical end of the sequence.
+	// The end is tracked as an index rather than an iterator, so copies of the
+	// container never refer to the storage of the object they were copied from.
+	template <typename T, std::size_t Capacity>
+	class static_vector
+	{
+		std::array<T, Capacity> array;
+		std::size_t size_;
+
+		public:
+
+		constexpr static_vector() : array(), size_(0) {}
+
+		constexpr auto begin() { return array.begin(); }
+		constexpr auto begin() const { return array.begin(); }
+		constexpr auto end() { return array.begin() + static_cast<std::ptrdiff_t>(size_); }
+		constexpr auto end() const { return array.begin() + static_cast<std::ptrdiff_t>(size_); }
+
+		// True when no element has been pushed (or all were popped).
+		constexpr bool empty() const
+		{
+			return size_ == 0;
+		}
+
+		// True when `Capacity` elements are stored and push_back is no longer allowed.
+		constexpr bool full() const
+		{
+			return size_ == Capacity;
+		}
+
+		// Appends `element`. The container must not be full.
+		constexpr void push_back(T element)
+		{
+			assert(not full());
+			array[size_++] = std::move(element);
+		}
+
+		// Drops the last element. The container must not be empty.
+		// The dropped element is not destroyed, only excluded from the range.
+		constexpr void pop_back()
+		{
+			assert(not empty());
+			--size_;
+		}
+
+		// The last stored element (not the slot past it). The container must not be empty.
+		constexpr auto& back()
+		{
+			assert(not empty());
+			return array[size_ - 1];
+		}
+
+		constexpr auto& back() const
+		{
+			assert(not empty());
+			return array[size_ - 1];
+		}
+
+	};
+
+	// Lookup table keyed by a small bit pattern. Only declared here, the partial
+	// specializations below choose the storage depending on the width of the key.
+	template <typename SmallKey, typename Value, typename Enabled = void>
+	class small_table;
+
+	// Unsigned keys of at most 13 bits: a flat array with one slot for every
+	// possible key, indexed directly by the key's bits.
+	template <typename SmallKey, typename Value>
+	class small_table<SmallKey, Value, std::enable_if_t<bit_count(SmallKey{}) <= 13 && std::is_unsigned_v<SmallKey>>>
+	{
+		std::array<Value, 1 << bit_count(SmallKey{})> table{};
+
+		// Moves the key's bits down to the least significant end, giving the array index.
+		static constexpr auto get_bit_value(const SmallKey& key)
+		{
+			auto value = get_bits(key);
+			constexpr auto total_width = std::numeric_limits<decltype(value)>::digits;
+			constexpr auto occupied_width = bit_offset(SmallKey{}) + bit_count(SmallKey{});
+			value >>= total_width - occupied_width;
+			return value;
+		}
+
+		// Inverse of get_bit_value: rebuilds the key that lives in slot `index`.
+		static constexpr SmallKey key_at(std::size_t index)
+		{
+			constexpr auto bc = bit_count(SmallKey{});
+			constexpr auto bo = bit_offset(SmallKey{});
+			using key_value = decltype(get_bits(SmallKey{}));
+			constexpr auto tc = std::numeric_limits<key_value>::digits;
+
+			auto key = static_cast<key_value>(index);
+			key <<= tc - bc - bo;
+			return static_cast<SmallKey>(key);
+		}
+
+		// Shared body of the const and non-const subscript operators.
+		template <typename Self>
+		static constexpr auto& get(Self& self, const SmallKey& key)
+		{
+			const auto slot = get_bit_value(key);
+			assert(slot < self.table.size());
+			return self.table[slot];
+		}
+
+		public:
+
+		constexpr Value& operator[](const SmallKey& key)
+		{
+			return get(*this, key);
+		}
+
+		constexpr const Value& operator[](const SmallKey& key) const
+		{
+			return get(*this, key);
+		}
+
+		// Calls `f` with a (key, value) pair for every slot of the table, in index order.
+		template <typename F>
+		constexpr void for_each(F&& f) const
+		{
+			for(std::size_t slot = 0; slot != table.size(); ++slot)
+				f(std::pair{key_at(slot), table[slot]});
+		}
+
+		// Calls `f` with a (key, value) pair for each slot in index order,
+		// stopping after the first call that returns true.
+		// TODO: oof, need proper iterators, also this is used to read codes, and that's slow, prolly better walk the tree bit by bit
+		template <typename F>
+		constexpr void find_if(F&& f) const
+		{
+			for(std::size_t slot = 0; slot != table.size(); ++slot)
+			{
+				if(f(std::pair{key_at(slot), table[slot]}))
+					return;
+			}
+		}
+
+	};
+
+	// Keys wider than 13 bits: meant to be backed by the hash table instead of a flat array.
+	// Only the storage is declared so far, there is no way to access it yet.
+	template <typename BigKey, typename Value>
+	class small_table<BigKey, Value, std::enable_if_t<(bit_count(BigKey{}) > 13)>>
+	{
+		hash_table<BigKey, std::pair<BigKey, Value>, (1<<16)> table;
+		// TODO
+	};
+
+	// Like std::underlying_type, but also accepts integral types, which map to themselves.
+	template <typename T, typename Enabled = void>
+	struct underlying_type;
+
+	// Integral types are already their own representation.
+	template <typename T>
+	struct underlying_type<T, std::enable_if_t<std::is_integral_v<T>>>
+	{
+		using type = T;
+	};
+
+	// Everything else is handed over to std::underlying_type_t.
+	template <typename T>
+	struct underlying_type<T, std::enable_if_t<!std::is_integral_v<T>>>
+	{
+		using type = std::underlying_type_t<T>;
+	};
+
+	template <typename T>
+	using underlying_type_t = typename underlying_type<T>::type;
+
+	// Builds a Huffman code for the symbols in [begin, end).
+	// Returns a table that maps every symbol to its code as a `bits<>` value;
+	// symbols that do not occur in the input keep an empty code (bit count 0).
+	// The tree is never materialized: the two least frequent entries are merged
+	// repeatedly, and every merge puts one more bit in front of the codes of all
+	// the symbols that belong to the two merged groups.
+	template <typename It>
+	[[nodiscard]] constexpr auto huffman_code(It begin, It end)
+	{
+		using key_type = std::make_unsigned_t<underlying_type_t<typename std::iterator_traits<It>::value_type>>;
+		// (symbol, representative of the group the symbol was merged into)
+		using link = std::pair<key_type, key_type>;
+
+		small_table<key_type, std::size_t> counter{};
+		small_table<key_type, bits<>> code{}; // TODO smaller key, smaller bits
+		std::conditional_t<bit_count(key_type{}) <= 13,
+			static_vector<link, (1 << bit_count(key_type{}))>,
+			std::vector<link>
+		> hierarchy{};
+
+		// symbol frequencies
+		for(auto symbol = begin; symbol != end; ++symbol)
+			++counter[*symbol];
+
+		// the two entries with the smallest non-zero counts, smallest first,
+		// a zero count marks an entry as not found
+		std::array<std::pair<key_type, std::size_t>, 2> minmin;
+		for(;;)
+		{
+			minmin = decltype(minmin){};
+			// NOTE: this could be partial_sort_copy(no_zeros(counter), minmin, second_less), but can't be bothered to write the necessary iterators atm
+			counter.for_each([&minmin](auto entry)
+			{
+				auto& lowest = minmin[0];
+				auto& runner_up = minmin[1];
+
+				// skip symbols that are absent or already merged away
+				if(entry.second == 0)
+					return;
+
+				// not smaller than what we already have
+				if(runner_up.second != 0 && !(entry.second < runner_up.second))
+					return;
+
+				runner_up = entry;
+				// keep em sorted
+				if(lowest.second == 0 || runner_up.second < lowest.second)
+					std::swap(lowest, runner_up);
+			});
+
+			// fewer than two groups left, nothing more to merge
+			if(minmin[1].second == 0)
+				break;
+
+			const auto kept = minmin[0].first;
+			const auto absorbed = minmin[1].first;
+
+			// FIXME handle insert fail
+			code[kept].insert(false);
+			code[absorbed].insert(true);
+			for(auto& [symbol, parent] : hierarchy)
+			{
+				if(parent == kept)
+					code[symbol].insert(false);
+				else if(parent == absorbed)
+				{
+					code[symbol].insert(true);
+					// the absorbed group is represented by the kept one from now on
+					parent = kept;
+				}
+			}
+
+			counter[kept] += counter[absorbed];
+			counter[absorbed] = 0;
+
+			hierarchy.push_back({absorbed, kept});
+		}
+
+		// special case: there is only one symbol
+		if(minmin[0].second != 0 && hierarchy.empty())
+			code[minmin[0].first].insert(false);
+
+		return code;
+	}
+
+	// Writes the code of every symbol in [begin, end) to `out`, in input order.
+	// Returns whatever std::transform returns, that is the advanced output iterator.
+	template <typename It, typename Out, typename Code>
+	constexpr auto huffman_encode(const Code& code, It begin, It end, Out out)
+	{
+		// NOTE: almost not worth a function, especially if add call operator to code...
+		// like this
+		// return std::transform(begin, end, out, std::ref(code));
+		// ref cause can't trust std to not copy galore, so maybe still some point to it, even in that form
+		const auto lookup = [&code](auto&& symbol) { return code[symbol]; };
+		return std::transform(begin, end, std::move(out), lookup);
+	}
+
+	// Decodes symbols from the bit stream `i` into [out, out_end) using `code`.
+	// For every output position the code table is searched for an entry whose bits
+	// match the upcoming input; on a match the symbol is stored and the input is
+	// advanced past the code. If nothing matches, the position is left as it was.
+	// Returns the bit iterator positioned after the last consumed code.
+	template <typename Code, typename I, typename O,
+		std::enable_if_t<support::is_template_instance_v<bit_iterator, I>>* = nullptr
+	>
+	constexpr auto huffman_decode(const Code& code, I i, O out, O out_end)
+	{
+		for(; out != out_end; ++out)
+		{
+			code.find_if([&i, &out](auto&& entry)
+			{
+				// an empty code belongs to a symbol that is not part of the encoding
+				if(bit_count(entry.second) == 0)
+					return false;
+
+				// read as many bits as the candidate code has and compare
+				auto candidate = entry.second;
+				auto after = read_bits(i, candidate);
+				if(!(candidate == entry.second))
+					return false;
+
+				*out = entry.first;
+				i = after;
+				return true;
+			});
+		}
+		return i;
+	}
+
+	// Overload for input iterators that are not a bit_iterator yet:
+	// starts decoding at bit 0 of the element `i` refers to.
+	template <typename Code, typename I, typename O,
+		std::enable_if_t<!support::is_template_instance_v<bit_iterator, I>>* = nullptr
+	>
+	constexpr auto huffman_decode(const Code& code, I i, O out_begin, O out_end)
+	{
+		return huffman_decode(code, bit_iterator{i, 0}, out_begin, out_end);
+	}
+
+} // namespace simple::compress
+
+#endif /* end of include guard */

+ 14 - 2
source/simple/compress/iterator.hpp

@@ -35,6 +35,18 @@ namespace simple::compress
 			value{}, bit_index{0}
 		{}
 
+		// Not copyable: the destructor writes out the pending bits, so there
+		// must be only one object responsible for them.
+		out_bits(out_bits&) = delete;
+		// Moving takes over the destination and the pending bits. The source's
+		// bit_index is zeroed so that its destructor has nothing left to write.
+		out_bits(out_bits&& other) : out(other.out), value(other.value), bit_index(other.bit_index)
+		{ other.bit_index = 0; }
+		out_bits& operator=(out_bits&) = delete;
+		// Same hand-over as the move constructor.
+		// NOTE(review): bits pending in the assigned-to object are overwritten without being written out - confirm that is intended.
+		out_bits& operator=(out_bits&& other)
+		{
+			out = other.out;
+			value = other.value;
+			bit_index = other.bit_index;
+			other.bit_index = 0;
+			// flowing off the end of a function returning a reference is undefined behaviour
+			return *this;
+		}
+
 		out_bits& operator++() { return *this; }
 		out_bits operator++(int) { out_bits prev = *this; ++(*this); return prev; }
 
@@ -108,8 +120,8 @@ namespace simple::compress
 
 		~out_bits()
 		{
-			// TODO flush
-			*out = value;
+			if(bit_index != 0)
+				*out = value;
 		}
 	};
 

+ 58 - 0
unit_tests/huffman.cpp

@@ -0,0 +1,58 @@
+// #include "simple/support/debug.hpp"
+
+#include "simple/compress/huffman.hpp"
+#include "simple/compress/iterator.hpp" // out_bits
+#include "simple/support/iterator.hpp" // offset_expander
+
+#include <cassert>
+#include <vector>
+#include <string>
+#include <cstdio>
+
+using namespace simple::compress;
+
+// Round trip check: builds a Huffman code for `text`, encodes the text into
+// bytes, decodes the bytes again and asserts the result equals the input.
+void Endecode(std::string text)
+{
+	auto code = huffman_code(text.begin(), text.end());
+
+#if defined SIMPLE_SUPPORT_DEBUG_HPP
+	simple::support::print('\n');
+	simple::support::println("CODE: ");
+	code.for_each([](auto && kv)
+	{
+		using std::to_string;
+		if(bit_count(kv.second) != 0)
+			simple::support::println(to_string(static_cast<int>(kv.first)) + " - " + to_string(kv.second));
+	});
+	simple::support::print('\n');
+#endif
+
+	std::vector<std::byte> encoded;
+	encoded.reserve(text.size());
+	// the bit writer is a temporary, it is gone by the end of this statement
+	huffman_encode(code, text.begin(), text.end(), out_bits(simple::support::offset_expander(encoded)));
+
+#if defined SIMPLE_SUPPORT_DEBUG_HPP
+	simple::support::print("INPUT SIZE: ", text.size(), '\n');
+	simple::support::print("COMPRESSED SIZE: ", encoded.size(), '\n');
+#endif
+
+	// the decoder is told how much to produce by the size of the output range
+	std::string decoded(text.size(), '\0');
+	huffman_decode(code, encoded.begin(), decoded.begin(), decoded.end());
+
+	assert(text == decoded);
+}
+
+// Runs the round trip check on a built-in sample, or on the contents of the
+// file named by the first command line argument.
+// Returns 0 on success and 1 when the file cannot be opened or read completely.
+int main(int argc, char const* argv[])
+{
+	std::string text = "abcd aaaa bbbb cccc aaaa abcd aaaa aaaa aaaa aaaaa aaaa aaaaaaaaaaa aaaaaaaaaaaaa aaaaaaaaaaaaaa aaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaa";
+	if(argc > 1)
+	{
+		std::FILE* file = std::fopen(argv[1], "rb");
+		if(file == nullptr)
+		{
+			std::perror(argv[1]);
+			return 1;
+		}
+
+		// measure the file by seeking to its end, ftell reports failure as -1
+		long size = -1;
+		if(std::fseek(file, 0, SEEK_END) == 0)
+			size = std::ftell(file);
+
+		bool loaded = size >= 0 && std::fseek(file, 0, SEEK_SET) == 0;
+		if(loaded)
+		{
+			text.resize(static_cast<std::string::size_type>(size));
+			// items of one byte, so the result is the number of bytes actually read
+			loaded = std::fread(text.data(), 1, text.size(), file) == text.size();
+		}
+		std::fclose(file);
+
+		if(not loaded)
+		{
+			std::fprintf(stderr, "failed to read %s\n", argv[1]);
+			return 1;
+		}
+#if defined SIMPLE_SUPPORT_DEBUG_HPP
+		simple::support::print("s: ", text.size(), '\n');
+#endif
+	}
+	Endecode(std::move(text));
+	return 0;
+}

+ 0 - 4
unit_tests/lz77.cpp

@@ -31,10 +31,6 @@ void Endecode(std::string text)
 	decoded.resize(text.size());
 	lz77_decode(encoded.begin(), decoded.begin(), decoded.end());
 
-#if defined SIMPLE_SUPPORT_DEBUG_HPP
-	simple::support::print("DE-COMPRESSED SIZE: ", decoded.size(), '\n');
-#endif
-
 	assert(text == decoded);
 }