Tidy up removing old files
This commit is contained in:
parent
b5db1b4e14
commit
04da891272
2 changed files with 0 additions and 155 deletions
Binary file not shown.
155
src/rice.erl
155
src/rice.erl
|
@ -1,155 +0,0 @@
|
||||||
-module(rice).
|
|
||||||
-export([encode/1,
|
|
||||||
encode/2,
|
|
||||||
checkforhash/2,
|
|
||||||
converttohash/1]).
|
|
||||||
-include_lib("eunit/include/eunit.hrl").
|
|
||||||
|
|
||||||
%% Factor is the power of 2 representing the expected normal gap size between
|
|
||||||
%% members of the hash, and therefore the size of the bitstring to represent the
|
|
||||||
%% remainder for the gap
|
|
||||||
%%
|
|
||||||
%% The encoded output should contain a single byte which is the Factor, followed
|
|
||||||
%% by a series of exponents and remainders.
|
|
||||||
%%
|
|
||||||
%% The exponent is n 1's followed by a 0, where n * (2 ^ Factor) + remainder
|
|
||||||
%% represents the gap to the next hash
|
|
||||||
%%
|
|
||||||
%% The size passed in should be the maximum possible value of the hash.
|
|
||||||
%% If this isn't provided - assumes 2^32 (note that erlang:phash2/1 itself
%% only yields values in the range 0..2^27-1)
|
|
||||||
|
|
||||||
%% Encode HashList using the default hash-space size of 2^32.
%% Equivalent to encode(HashList, 4294967296).
encode(HashList) ->
    DefaultSize = 1 bsl 32,
    encode(HashList, DefaultSize).
|
|
||||||
|
|
||||||
%% Rice-encode a list of integer hashes.
%%
%% HashList - list of hashes; it is sorted and de-duplicated here.
%% Size     - maximum possible hash value, used to estimate the average gap
%%            between members and so to choose the Factor.
%%
%% Returns a bitstring whose first byte is the Factor.
%%
%% An empty list is encoded as just the Factor byte (0); without this clause
%% the gap-size calculation would divide by zero.
encode([], _Size) ->
    riceencode([], 0);
encode(HashList, Size) ->
    SortedHashList = lists:usort(HashList),
    ExpectedGapSize = Size div length(SortedHashList),
    %% findpowerundergap/1 yields -1 when the expected gap is 0 (more members
    %% than Size); a negative Factor cannot be encoded, so clamp it to 0.
    Factor = max(0, findpowerundergap(ExpectedGapSize)),
    riceencode(SortedHashList, Factor).
|
|
||||||
|
|
||||||
%% Outcome may be suboptimal if lists have not been de-duplicated
|
|
||||||
%% Will fail on an unsorted list
|
|
||||||
|
|
||||||
%% Rice-encode an ascending list of non-negative integer hashes.
%%
%% HashList - ascending list of hashes; adjacent duplicates are skipped.
%% Factor   - number of bits used for each remainder (0..255); it is written
%%            as the first byte of the result.
%%
%% Each gap between successive hashes (the first gap is measured from 0) is
%% written as a unary exponent followed by a Factor-bit remainder.
%% Fails with function_clause if the list is not ascending or Factor is out
%% of range.
riceencode(HashList, Factor) when is_integer(Factor), Factor >= 0, Factor < 256 ->
    Divisor = powtwo(Factor),
    %% 'first' marks that no hash has been written yet, so that a leading
    %% hash of 0 is encoded as a gap of 0 instead of being mistaken for a
    %% duplicate of the initial position and dropped.
    riceencode(HashList, Factor, Divisor, <<>>, first).

riceencode([], Factor, _Divisor, BitStrAcc, _LastHash) ->
    <<Factor:8, BitStrAcc/bitstring>>;
riceencode([HeadHash | TailList], Factor, Divisor, BitStrAcc, first)
  when is_integer(HeadHash), HeadHash >= 0 ->
    ExpandedBitStrAcc = riceappend(BitStrAcc, HeadHash, Factor, Divisor),
    riceencode(TailList, Factor, Divisor, ExpandedBitStrAcc, HeadHash);
riceencode([HeadHash | TailList], Factor, Divisor, BitStrAcc, HeadHash) ->
    %% Same value as the previous member: nothing to add.
    riceencode(TailList, Factor, Divisor, BitStrAcc, HeadHash);
riceencode([HeadHash | TailList], Factor, Divisor, BitStrAcc, LastHash)
  when is_integer(HeadHash), HeadHash > LastHash ->
    ExpandedBitStrAcc = riceappend(BitStrAcc, HeadHash - LastHash, Factor, Divisor),
    riceencode(TailList, Factor, Divisor, ExpandedBitStrAcc, HeadHash).

%% Append one gap to the accumulator: unary exponent, then Factor-bit remainder.
riceappend(BitStrAcc, HashGap, Factor, Divisor) ->
    Exponent = buildexponent(HashGap div Divisor),
    Remainder = HashGap rem Divisor,
    <<BitStrAcc/bitstring, Exponent/bitstring, Remainder:Factor>>.
|
|
||||||
|
|
||||||
|
|
||||||
%% Checking for a hash needs to roll through the compressed bloom, decoding until
|
|
||||||
%% the member is found (match!), passed (not matched) or the end of the encoded
|
|
||||||
%% bitstring has been reached (not matched)
|
|
||||||
|
|
||||||
%% Test whether HashToCheck is a member of the encoded set BitStr.
%%
%% The first byte of BitStr is the Factor; the rest is a sequence of
%% exponent/remainder pairs. Members are decoded in ascending order until the
%% hash is found (true), a larger member is seen (false) or the input is
%% exhausted (false).
checkforhash(HashToCheck, BitStr) ->
    <<Factor:8/integer, Encoded/bitstring>> = BitStr,
    checkforhash(HashToCheck, Encoded, Factor, powtwo(Factor), 0).

%% Decode loop: Previous is the last member decoded so far (0 at the start).
checkforhash(_HashToCheck, <<>>, _Factor, _Divisor, _Previous) ->
    false;
checkforhash(HashToCheck, Encoded, Factor, Divisor, Previous) ->
    [Quotient, AfterExponent] = findexponent(Encoded),
    [Remainder, Remaining] = findremainder(AfterExponent, Factor),
    Candidate = Previous + Divisor * Quotient + Remainder,
    if
        Candidate =:= HashToCheck ->
            true;
        Candidate > HashToCheck ->
            %% Members are ascending, so the hash cannot appear later.
            false;
        true ->
            checkforhash(HashToCheck, Remaining, Factor, Divisor, Candidate)
    end.
|
|
||||||
|
|
||||||
|
|
||||||
%% Exported functions - currently used only in testing
|
|
||||||
|
|
||||||
%% Hash every item with erlang:phash2/1.
%% The resulting list is in reverse order relative to ItemList.
converttohash(ItemList) ->
    lists:foldl(
        fun(Item, Hashes) -> [erlang:phash2(Item) | Hashes] end,
        [],
        ItemList
    ).
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%% Helper functions
|
|
||||||
|
|
||||||
%% Build the unary code for Exponent: that many 1 bits followed by a single
%% 0 bit, e.g. buildexponent(2) =:= <<1:1, 1:1, 0:1>>.
%%
%% The bitstring is constructed in one step rather than by repeatedly
%% prepending a bit, which copied the accumulator on every iteration.
%% A negative or non-integer Exponent fails with function_clause instead of
%% recursing without end.
buildexponent(Exponent) when is_integer(Exponent), Exponent >= 0 ->
    AllOnes = (1 bsl Exponent) - 1,
    <<AllOnes:Exponent, 0:1>>.
|
|
||||||
|
|
||||||
|
|
||||||
%% Read a unary-coded exponent from the front of BitStr.
%% Returns [Count, Rest], where Count is the number of 1 bits before the
%% first 0 bit and Rest is everything after that 0 bit.
findexponent(BitStr) ->
    findexponent(BitStr, 0).

%% Ones counts the 1 bits consumed so far.
findexponent(BitStr, Ones) ->
    <<Bit:1/integer, Rest/bitstring>> = BitStr,
    case Bit of
        1 -> findexponent(Rest, Ones + 1);
        0 -> [Ones, Rest]
    end.
|
|
||||||
|
|
||||||
|
|
||||||
%% Read a Factor-bit unsigned integer from the front of BitStr.
%% Returns [Value, Rest], where Rest is the bitstring following those bits.
findremainder(BitStr, Factor) ->
    <<Value:Factor/integer, Rest/bitstring>> = BitStr,
    [Value, Rest].
|
|
||||||
|
|
||||||
|
|
||||||
%% Return 2 raised to the power N, for a non-negative integer N.
%% A negative or non-integer N fails with function_clause; the previous
%% count-down recursion never terminated for negative N.
powtwo(N) when is_integer(N), N >= 0 ->
    1 bsl N.
|
|
||||||
|
|
||||||
%% Helper function for finding the power of two which provides the most
|
|
||||||
%% efficient compression given an average gap size
|
|
||||||
|
|
||||||
%% Return the largest exponent E such that 2^E does not exceed GapSize.
%% Returns -1 when GapSize is below 1.
findpowerundergap(GapSize) ->
    findpowerundergap(GapSize, 1, 0).

%% Power is always 2^Exponent; stop at the first power above GapSize and
%% step back one exponent.
findpowerundergap(GapSize, Power, Exponent) when Power > GapSize ->
    Exponent - 1;
findpowerundergap(GapSize, Power, Exponent) ->
    findpowerundergap(GapSize, Power * 2, Exponent + 1).
|
|
||||||
|
|
||||||
|
|
||||||
%% Unit tests
|
|
||||||
|
|
||||||
%% Gap sizes around a power-of-two boundary, paired with the expected exponent.
findpowerundergap_test_() ->
    Cases = [{700, 9}, {512, 9}, {511, 8}],
    [?_assertEqual(Expected, findpowerundergap(GapSize))
     || {GapSize, Expected} <- Cases].
|
|
||||||
|
|
||||||
%% Duplicate members must not change the encoded output.
encode_test_() ->
    Expected = <<9, 6, 44, 4:5>>,
    Inputs = [[24, 924], [24, 24, 924], [24, 924, 924]],
    [?_assertEqual(Expected, encode(HashList, 1024)) || HashList <- Inputs].
|
|
||||||
|
|
||||||
%% Membership checks against the encoding of [24, 924]: both members are
%% found; their neighbours are not.
check_test_() ->
    Encoded = <<9, 6, 44, 4:5>>,
    Cases = [{924, true}, {24, true}, {23, false}, {923, false}, {925, false}],
    [?_assertEqual(Expected, checkforhash(Hash, Encoded))
     || {Hash, Expected} <- Cases].
|
|
Loading…
Add table
Add a link
Reference in a new issue