Skip to content

Commit 1925afe

Browse files
committed
A command line tool for finding duplicate files and optionally removing them
Progressive MurmurHash3 implementation by Shane Day
1 parent ba7a97a commit 1925afe

File tree

4 files changed

+684
-0
lines changed

4 files changed

+684
-0
lines changed

LICENSE.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2015 Logicore Software
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

PMurHash.c

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
/*-----------------------------------------------------------------------------
2+
* MurmurHash3 was written by Austin Appleby, and is placed in the public
3+
* domain.
4+
*
5+
* This implementation was written by Shane Day, and is also public domain.
6+
*
7+
* This is a portable ANSI C implementation of MurmurHash3_x86_32 (Murmur3A)
8+
* with support for progressive processing.
9+
*/
10+
11+
/*-----------------------------------------------------------------------------
12+
13+
If you want to understand the MurmurHash algorithm you would be much better
14+
off reading the original source. Just point your browser at:
15+
http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
16+
17+
18+
What does this version provide?
19+
20+
1. Progressive data feeding. Useful when the entire payload to be hashed
21+
does not fit in memory or when the data is streamed through the application.
22+
Also useful when hashing a number of strings with a common prefix. A partial
23+
hash of a prefix string can be generated and reused for each suffix string.
24+
25+
2. Portability. Plain old C so that it should compile on any old compiler.
26+
Both CPU endian and access-alignment neutral, but avoiding inefficient code
27+
when possible depending on CPU capabilities.
28+
29+
3. Drop in. I personally like nice self contained public domain code, making it
30+
easy to pilfer without loads of refactoring to work properly in the existing
31+
application code & makefile structure and mucking around with licence files.
32+
Just copy PMurHash.h and PMurHash.c and you're ready to go.
33+
34+
35+
How does it work?
36+
37+
We can only process entire 32 bit chunks of input, except for the very end
38+
that may be shorter. So along with the partial hash we need to give back to
39+
the caller a carry containing up to 3 bytes that we were unable to process.
40+
This carry also needs to record the number of bytes the carry holds. I use
41+
the low 2 bits as a count (0..3) and the carry bytes are shifted into the
42+
high byte in stream order.
43+
44+
To handle endianness I simply use a macro that reads a uint32_t and define
45+
that macro to be a direct read on little endian machines, a read and swap
46+
on big endian machines, or a byte-by-byte read if the endianness is unknown.
47+
48+
-----------------------------------------------------------------------------*/
49+
50+
51+
#include "PMurHash.h"
52+
53+
/* PMurHash.h (included above) exposes deliberately unusual type names,
 * MH_UINT32 and MH_UINT8, so that it cannot collide with application or
 * system typedefs & defines. From here on the familiar C99 spellings are
 * mapped onto those names so the rest of this file reads like C99. */
#undef uint8_t
#define uint8_t MH_UINT8
#undef uint32_t
#define uint32_t MH_UINT32

/* MSVC diagnostics that are expected in this file and therefore silenced */
#ifdef _MSC_VER
  #pragma warning(disable: 4127)  /* conditional expression is constant */
#endif
65+
66+
/*-----------------------------------------------------------------------------
67+
* Endianness, misalignment capabilities and util macros
68+
*
69+
* The following 3 macros are defined in this section. The other macros defined
70+
* are only needed to help derive these 3.
71+
*
72+
* READ_UINT32(x) Read a little endian unsigned 32-bit int
73+
* UNALIGNED_SAFE Defined if READ_UINT32 works on non-word boundaries
74+
* ROTL32(x,r) Rotate x left by r bits
75+
*/
76+
77+
/* Convention is to define __BYTE_ORDER == to one of these values */
#if !defined(__BIG_ENDIAN)
  #define __BIG_ENDIAN 4321
#endif
#if !defined(__LITTLE_ENDIAN)
  #define __LITTLE_ENDIAN 1234
#endif

/* I386: little endian and tolerant of unaligned word reads */
#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(i386)
  #define __BYTE_ORDER __LITTLE_ENDIAN
  #define UNALIGNED_SAFE
#endif

/* gcc 'may' define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ to 1 (Note the trailing __),
 * or even _LITTLE_ENDIAN or _BIG_ENDIAN (Note the single _ prefix) */
#if !defined(__BYTE_ORDER)
  #if defined(__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__==1 || defined(_LITTLE_ENDIAN) && _LITTLE_ENDIAN==1
    #define __BYTE_ORDER __LITTLE_ENDIAN
  #elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__==1 || defined(_BIG_ENDIAN) && _BIG_ENDIAN==1
    #define __BYTE_ORDER __BIG_ENDIAN
  #endif
#endif

/* gcc (usually) defines xEL/EB macros for ARM and MIPS endianness */
#if !defined(__BYTE_ORDER)
  #if defined(__ARMEL__) || defined(__MIPSEL__)
    #define __BYTE_ORDER __LITTLE_ENDIAN
  #endif
  #if defined(__ARMEB__) || defined(__MIPSEB__)
    #define __BYTE_ORDER __BIG_ENDIAN
  #endif
#endif

/* Now find best way we can to READ_UINT32.
 *
 * READ_UINT32(ptr) yields the little endian 32-bit value stored at ptr.
 * In the byte-wise variants every byte is widened to uint32_t BEFORE it is
 * shifted: a plain uint8_t operand would be promoted to (signed) int, and
 * shifting a byte >= 0x80 left by 24 would then overflow into the sign bit.
 * The argument is parenthesised so that pointer expressions such as
 * READ_UINT32(p + 4) expand correctly. */
#if __BYTE_ORDER==__LITTLE_ENDIAN
  /* CPU endian matches murmurhash algorithm, so read 32-bit word directly */
  #define READ_UINT32(ptr)   (*((const uint32_t*)(ptr)))
#elif __BYTE_ORDER==__BIG_ENDIAN
  /* TODO: Add additional cases below where a compiler provided bswap32 is available */
  #if defined(__GNUC__) && (__GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>=3))
    #define READ_UINT32(ptr)   (__builtin_bswap32(*((const uint32_t*)(ptr))))
  #else
    /* Without a known fast bswap32 we're just as well off doing this */
    #define READ_UINT32(ptr)   ((uint32_t)(ptr)[0]     | (uint32_t)(ptr)[1]<<8 | \
                                (uint32_t)(ptr)[2]<<16 | (uint32_t)(ptr)[3]<<24)
    #define UNALIGNED_SAFE
  #endif
#else
  /* Unknown endianness so last resort is to read individual bytes */
  #define READ_UINT32(ptr)   ((uint32_t)(ptr)[0]     | (uint32_t)(ptr)[1]<<8 | \
                              (uint32_t)(ptr)[2]<<16 | (uint32_t)(ptr)[3]<<24)

  /* Since we're not doing word-reads we can skip the messing about with realignment */
  #define UNALIGNED_SAFE
#endif
131+
132+
/* Find best way to ROTL32 */
#if defined(_MSC_VER)
  #include <stdlib.h>  /* Microsoft put _rotl declaration in here */
  #define ROTL32(x,r)  _rotl(x,r)
#else
  /* gcc recognises this code and generates a rotate instruction for CPUs with one.
   * Both arguments are parenthesised so that a compound expression such as
   * ROTL32(a|b, r) is rotated as a whole. r must be in 1..31, since r==0
   * would shift right by the full 32 bits. */
  #define ROTL32(x,r)  (((uint32_t)(x) << (r)) | ((uint32_t)(x) >> (32 - (r))))
#endif


/*-----------------------------------------------------------------------------
 * Core murmurhash algorithm macros */

#define C1  (0xcc9e2d51)
#define C2  (0x1b873593)

/* This is the main processing body of the algorithm. It operates
 * on each full 32-bits of input. Both h1 and k1 must be modifiable
 * uint32_t lvalues; k1 is overwritten with its mixed value. */
#define DOBLOCK(h1, k1) do{ \
        (k1) *= C1; \
        (k1) = ROTL32((k1),15); \
        (k1) *= C2; \
        \
        (h1) ^= (k1); \
        (h1) = ROTL32((h1),13); \
        (h1) = (h1)*5+0xe6546b64; \
    }while(0)


/* Append unaligned bytes to carry, forcing hash churn if we have 4 bytes */
/* cnt=bytes to process, h1=name of h1 var, c=carry, n=bytes in c, ptr/len=payload */
/* cnt is read once before the loop, so passing the same variable for cnt
 * and len is fine. Each byte is widened to uint32_t before it is shifted
 * into the top of the carry; shifting the int-promoted byte instead would
 * overflow into the sign bit for bytes >= 0x80. */
#define DOBYTES(cnt, h1, c, n, ptr, len) do{ \
    int dobytes_left_ = (cnt); \
    while(dobytes_left_--) { \
        (c) = (c)>>8 | (uint32_t)*(ptr)++<<24; \
        (n)++; (len)--; \
        if((n)==4) { \
            DOBLOCK(h1, c); \
            (n) = 0; \
        } \
    } }while(0)
173+
174+
/*---------------------------------------------------------------------------*/
175+
176+
/* Main hashing function: folds len bytes starting at key into a running
 * hash state so that input can be supplied in several pieces.
 *
 *   ph1     in/out  running hash. Initialise to 0, or to an initial seed if
 *                   wanted, before the first call.
 *   pcarry  in/out  input bytes that have not yet filled a whole 32-bit
 *                   block. The low 2 bits hold how many such bytes there are
 *                   (0..3). Initialise to 0 before the first call.
 *   key             data to hash
 *   len             number of bytes at key. A negative length is ignored and
 *                   leaves *ph1 and *pcarry untouched.
 *
 * Both ph1 and pcarry are required arguments. */
void PMurHash32_Process(uint32_t *ph1, uint32_t *pcarry, const void *key, int len)
{
  uint32_t h1 = *ph1;
  uint32_t c = *pcarry;

  const uint8_t *ptr = (const uint8_t*)key;
  const uint8_t *end;

  /* Extract carry count from low 2 bits of c value */
  int n = c & 3;
  int i;

  /* The byte loops below count down from a byte count; a negative count
   * would never reach zero and would read far beyond the caller's buffer. */
  if(len < 0) {
    return;
  }

#if defined(UNALIGNED_SAFE)
  /* This CPU handles unaligned word access */

  /* Consume any carry bytes: top the carry up to a full block first so that
   * the word loop below starts with an empty carry */
  i = (4-n) & 3;
  if(i && i <= len) {
    DOBYTES(i, h1, c, n, ptr, len);
  }

  /* Process 32-bit chunks */
  end = ptr + len/4*4;
  for( ; ptr < end ; ptr+=4) {
    uint32_t k1 = READ_UINT32(ptr);
    DOBLOCK(h1, k1);
  }

#else /*UNALIGNED_SAFE*/
  /* This CPU does not handle unaligned word access */

  /* Consume enough so that the next data byte is word aligned.
   * i = number of bytes up to the next 4-byte boundary (0..3), computed in
   * unsigned arithmetic so no signed value is ever negated. If fewer than i
   * bytes are available, len is below 4 and no word reads happen at all. */
  i = (int)((4u - ((unsigned long)ptr & 3u)) & 3u);
  if(i && i <= len) {
    DOBYTES(i, h1, c, n, ptr, len);
  }

  /* We're now aligned. Process in aligned blocks. Specialise for each possible carry count */
  end = ptr + len/4*4;
  switch(n) { /* how many bytes in c */
  case 0: /* c=[----]  w=[3210]  b=[3210]=w            c'=[----] */
    for( ; ptr < end ; ptr+=4) {
      uint32_t k1 = READ_UINT32(ptr);
      DOBLOCK(h1, k1);
    }
    break;
  case 1: /* c=[0---]  w=[4321]  b=[3210]=c>>24|w<<8   c'=[4---] */
    for( ; ptr < end ; ptr+=4) {
      uint32_t k1 = c>>24;
      c = READ_UINT32(ptr);
      k1 |= c<<8;
      DOBLOCK(h1, k1);
    }
    break;
  case 2: /* c=[10--]  w=[5432]  b=[3210]=c>>16|w<<16  c'=[54--] */
    for( ; ptr < end ; ptr+=4) {
      uint32_t k1 = c>>16;
      c = READ_UINT32(ptr);
      k1 |= c<<16;
      DOBLOCK(h1, k1);
    }
    break;
  case 3: /* c=[210-]  w=[6543]  b=[3210]=c>>8|w<<24   c'=[654-] */
    for( ; ptr < end ; ptr+=4) {
      uint32_t k1 = c>>8;
      c = READ_UINT32(ptr);
      k1 |= c<<24;
      DOBLOCK(h1, k1);
    }
    break;
  default: /* n is c & 3, so it is always one of the cases above */
    break;
  }
#endif /*UNALIGNED_SAFE*/

  /* Advance over whole 32-bit chunks, possibly leaving 1..3 bytes */
  len -= len/4*4;

  /* Append any remaining bytes into carry */
  DOBYTES(len, h1, c, n, ptr, len);

  /* Copy out new running hash and carry; the low byte of the carry is
   * replaced by the byte count */
  *ph1 = h1;
  *pcarry = (c & ~(uint32_t)0xff) | n;
}
259+
260+
/*---------------------------------------------------------------------------*/
261+
262+
/* Finalize a hash. To match the original Murmur3A the total_length must be provided */
263+
uint32_t PMurHash32_Result(uint32_t h, uint32_t carry, uint32_t total_length)
264+
{
265+
uint32_t k1;
266+
int n = carry & 3;
267+
if(n) {
268+
k1 = carry >> (4-n)*8;
269+
k1 *= C1; k1 = ROTL32(k1,15); k1 *= C2; h ^= k1;
270+
}
271+
h ^= total_length;
272+
273+
/* fmix */
274+
h ^= h >> 16;
275+
h *= 0x85ebca6b;
276+
h ^= h >> 13;
277+
h *= 0xc2b2ae35;
278+
h ^= h >> 16;
279+
280+
return h;
281+
}
282+
283+
/*---------------------------------------------------------------------------*/
284+
285+
/* Murmur3A compatible all-at-once hash: runs PMurHash32_Process once over
 * the len bytes at key, starting from seed with an empty carry, and returns
 * the value produced by PMurHash32_Result. */
uint32_t PMurHash32(uint32_t seed, const void *key, int len)
{
  uint32_t hash = seed;
  uint32_t pending = 0;

  PMurHash32_Process(&hash, &pending, key, len);

  return PMurHash32_Result(hash, pending, (uint32_t)len);
}
292+
293+
/*---------------------------------------------------------------------------*/
294+
295+
/* Provide an API suitable for smhasher.
 *
 *   key, len  data to hash
 *   seed      initial hash value
 *   out       receives the 32-bit result; it is written through a uint32_t
 *             pointer
 *
 * Flip the "#if 0" below to 1 to feed the data in randomly sized pieces
 * instead of a single call (that variant uses rand()). */
void PMurHash32_test(const void *key, int len, uint32_t seed, void *out)
{
  const uint8_t *cursor = (const uint8_t*)key;
  const uint8_t *limit = cursor + len;
  uint32_t hash = seed;
  uint32_t pending = 0;

#if 0 /* Exercise the progressive processing */
  while(cursor < limit) {
    /* alternative split point: cursor + rand()%(limit-cursor)+1 */
    const uint8_t *split = cursor + (rand()&0xF);
    if(split >= limit) {
      split = limit;
    }
    PMurHash32_Process(&hash, &pending, cursor, split-cursor);
    cursor = split;
  }
#else
  PMurHash32_Process(&hash, &pending, cursor, (int)(limit-cursor));
#endif

  *(uint32_t*)out = PMurHash32_Result(hash, pending, len);
}
316+
317+
/*---------------------------------------------------------------------------*/

0 commit comments

Comments
 (0)