Skip to content

Commit 9dad4bc

Browse files
committed
Big speedup
1 parent e3694d1 commit 9dad4bc

File tree

2 files changed

+60
-44
lines changed

2 files changed

+60
-44
lines changed

BUILD.bat

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
@REM cd "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64"
2-
@REM vcvarsx86_amd64.bat
3-
@REM cd <code path>
1+
@REM 64 bit:
2+
@REM call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64\vcvarsx86_amd64.bat"
3+
@REM 32 bit:
4+
@REM call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\vcvars32.bat"
45

56
cl duplicates.cpp PMurHash.c /O2 /EHsc /GA /MT /FeDedup.exe

duplicates.cpp

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ SOFTWARE.
3131
#include "PMurHash.h"
3232

3333
#define BUFFERSIZE (64*1024*1024)
34+
#define VERSION "0.2"
3435

3536
using namespace std;
3637

@@ -42,9 +43,9 @@ class oneFile
4243
uint64_t timeStamp;
4344
};
4445

45-
map<uint64_t,map<uint32_t,list<oneFile>>> g_files; //Map of sizes,map of hashes, list of objects
46+
map<uint64_t,list<oneFile>> g_files; //Map of file sizes to the list of files with that size
4647
void RecurseFilePath(wstring path);
47-
uint32_t CalculateFileHash(const wstring& path, WIN32_FIND_DATAW& findData,uint64_t fileSize);
48+
uint32_t CalculateFileHash(const wstring& newPath, uint64_t fileSize);
4849
void checkDuplicates(bool deleteFiles,bool showDuplicates);
4950
bool AreDuplicates(const wstring& file1, const wstring& file2, uint64_t fileSize);
5051
vector<BYTE> g_buffer;
@@ -54,9 +55,10 @@ uint64_t g_filesProcessed=0;
5455

5556
int wmain(int argc, wchar_t* argv[])
5657
{
58+
DWORD timeTaken=GetTickCount();
5759
bool deleteFiles=false;
5860
bool showDuplicates=false;
59-
printf("Dedup (c) 2015 Logicore Software\n");
61+
printf("Dedup v%s (c) 2015 Logicore Software\n",VERSION);
6062
printf("www.logicore.se\n");
6163
printf("The software is provided as is. Use at your own risk.\n");
6264
if(argc<2)
@@ -82,10 +84,13 @@ int wmain(int argc, wchar_t* argv[])
8284
g_buffer2.resize(BUFFERSIZE);
8385

8486
wstring path=argv[1];
85-
printf("Hashing files...\n");
87+
printf("Scanning files...\n");
8688
RecurseFilePath(path);
87-
printf("%I64d Files hashed. Performing comparisons\n",g_filesProcessed);
89+
printf("%I64d Files found. Performing comparisons\n",g_filesProcessed);
8890
checkDuplicates(deleteFiles,showDuplicates);
91+
92+
timeTaken=GetTickCount()-timeTaken;
93+
printf("Time taken: %d seconds\n",timeTaken/1000);
8994
return 0;
9095
}
9196

@@ -96,45 +101,59 @@ void checkDuplicates(bool deleteFiles,bool showDuplicates)
96101

97102
for(auto& o : g_files) //Loop over file sizes
98103
{
99-
for(auto& o2 : o.second) //loop over hash values
104+
if(o.second.size()>1) //Size collision, need to hash & compare
100105
{
101-
bool stillOK=true;
102-
if(o2.second.size()>1 && stillOK)
106+
map<uint32_t,list<oneFile>> hashes;
107+
for(auto& o2 : o.second) //loop over the files sharing this size and hash each one
103108
{
104-
stillOK=false;
105-
oneFile of=o2.second.front();
106-
// Figure out which file is oldest.
107-
for each(auto& o3 in o2.second)
108-
{
109-
if(o3.timeStamp<of.timeStamp)
110-
of=o3;
111-
}
112-
wstring fileName1=of.path+L"\\"+of.name;
113-
for(list<oneFile>::iterator o3=o2.second.begin();o3!=o2.second.end();)
109+
wstring fileName=o2.path+L"\\"+o2.name;
110+
uint32_t hash=CalculateFileHash(fileName,o.first);
111+
hashes[hash].push_back(o2);
112+
}
113+
114+
for(auto& o2 : hashes) //loop over hash values
115+
{
116+
bool stillOK=true;
117+
if(o2.second.size()>1 && stillOK)
114118
{
115-
if(!( (o3->path== of.path) && (o3->name== of.name) ))
119+
stillOK=false;
120+
oneFile of=o2.second.front();
121+
// Figure out which file is oldest.
122+
for each(auto& o3 in o2.second)
116123
{
117-
wstring fileName2=o3->path+L"\\"+o3->name;
118-
if(AreDuplicates(fileName1, fileName2,o.first))
124+
if(o3.timeStamp<of.timeStamp)
125+
of=o3;
126+
}
127+
wstring fileName1=of.path+L"\\"+of.name;
128+
for(list<oneFile>::iterator o3=o2.second.begin();o3!=o2.second.end();)
129+
{
130+
if(!( (o3->path== of.path) && (o3->name== of.name) ))
131+
{
132+
wstring fileName2=o3->path+L"\\"+o3->name;
133+
if(AreDuplicates(fileName1, fileName2,o.first))
134+
{
135+
if(showDuplicates)
136+
wprintf(L"%s is a duplicate of %s\n",fileName2.c_str(),fileName1.c_str());
137+
if(deleteFiles)
138+
{
139+
if(!DeleteFileW(fileName2.c_str()));
140+
wprintf(L"Could not delete %s\n",fileName2.c_str());
141+
}
142+
stillOK=true;
143+
o3=o2.second.erase(o3);
144+
duplicates++;
145+
bytesSaved+=o.first;
146+
continue; //to the loop
147+
}
148+
}
149+
else
119150
{
120-
if(showDuplicates)
121-
wprintf(L"%s is a duplicate of %s\n",fileName2.c_str(),fileName1.c_str());
122-
if(deleteFiles)
123-
DeleteFileW(fileName2.c_str());
124151
stillOK=true;
125152
o3=o2.second.erase(o3);
126-
duplicates++;
127-
bytesSaved+=o.first;
128-
continue; //to the loop
153+
continue;
129154
}
155+
o3++;
130156
}
131-
else
132-
{
133-
stillOK=true;
134-
o3=o2.second.erase(o3);
135-
continue;
136-
}
137-
o3++;
138157
}
139158
}
140159
}
@@ -187,16 +206,14 @@ void RecurseFilePath(wstring path)
187206
uint64_t fileSize=findData.nFileSizeHigh;
188207
fileSize=fileSize<<32;
189208
fileSize+=findData.nFileSizeLow;
190-
uint32_t hash=CalculateFileHash(path,findData,fileSize);
191-
192209

193210
o.timeStamp=findData.ftCreationTime.dwHighDateTime;
194211
o.timeStamp=o.timeStamp<<32;
195212
o.timeStamp+=findData.ftCreationTime.dwLowDateTime;
196213

197214
o.name=findData.cFileName;
198215
o.path=path;
199-
g_files[fileSize][hash].push_back(o);
216+
g_files[fileSize].push_back(o);
200217
g_filesProcessed++;
201218
}
202219
}while(true);
@@ -240,14 +257,12 @@ bool AreDuplicates(const wstring& file1, const wstring& file2, uint64_t fileSize
240257
return true;
241258
}
242259

243-
uint32_t CalculateFileHash(const wstring& path, WIN32_FIND_DATAW& findData,uint64_t fileSize)
260+
uint32_t CalculateFileHash(const wstring& newPath, uint64_t fileSize)
244261
{
245262
MH_UINT32 hash=0;
246263
MH_UINT32 carry=0;
247264
uint64_t totalSize=fileSize;
248265

249-
wstring newPath=path+L"\\";
250-
newPath+=findData.cFileName;
251266
FILE* fp=_wfopen(newPath.c_str(),L"rb");
252267
if(fp)
253268
{

0 commit comments

Comments
 (0)