@@ -31,6 +31,7 @@ SOFTWARE.
3131#include " PMurHash.h"
3232
3333#define BUFFERSIZE (64 *1024 *1024 )
34+ #define VERSION " 0.2"
3435
3536using namespace std ;
3637
@@ -42,9 +43,9 @@ class oneFile
4243 uint64_t timeStamp;
4344};
4445
45- map<uint64_t ,map< uint32_t , list<oneFile> >> g_files; // Map of sizes,map of hashes, list of objects
46+ map<uint64_t ,list<oneFile>> g_files; // Map of file size -> list of files with that size (hashing is deferred to checkDuplicates, only on size collisions)
4647void RecurseFilePath (wstring path);
47- uint32_t CalculateFileHash (const wstring& path, WIN32_FIND_DATAW& findData, uint64_t fileSize);
48+ uint32_t CalculateFileHash (const wstring& newPath, uint64_t fileSize);
4849void checkDuplicates (bool deleteFiles,bool showDuplicates);
4950bool AreDuplicates (const wstring& file1, const wstring& file2, uint64_t fileSize);
5051vector<BYTE> g_buffer;
@@ -54,9 +55,10 @@ uint64_t g_filesProcessed=0;
5455
5556int wmain (int argc, wchar_t * argv[])
5657{
58+ DWORD timeTaken=GetTickCount ();
5759 bool deleteFiles=false ;
5860 bool showDuplicates=false ;
59- printf (" Dedup (c) 2015 Logicore Software\n " );
61+ printf (" Dedup v%s (c) 2015 Logicore Software\n " ,VERSION );
6062 printf (" www.logicore.se\n " );
6163 printf (" The software is provided as is. Use at your own risk.\n " );
6264 if (argc<2 )
@@ -82,10 +84,13 @@ int wmain(int argc, wchar_t* argv[])
8284 g_buffer2.resize (BUFFERSIZE);
8385
8486 wstring path=argv[1 ];
85- printf (" Hashing files...\n " );
87+ printf (" Scanning files...\n " );
8688 RecurseFilePath (path);
87- printf (" %I64d Files hashed . Performing comparisons\n " ,g_filesProcessed);
89+ printf (" %I64d Files found . Performing comparisons\n " ,g_filesProcessed);
8890 checkDuplicates (deleteFiles,showDuplicates);
91+
92+ timeTaken=GetTickCount ()-timeTaken;
93+ printf (" Time taken: %d seconds\n " ,timeTaken/1000 );
8994 return 0 ;
9095}
9196
@@ -96,45 +101,59 @@ void checkDuplicates(bool deleteFiles,bool showDuplicates)
96101
97102 for (auto & o : g_files) // Loop over file sizes
98103 {
99- for ( auto & o2 : o.second ) // loop over hash values
104+ if ( o.second . size ()> 1 ) // Size collision, need to hash & compare
100105 {
101- bool stillOK= true ;
102- if (o2 .second . size ()> 1 && stillOK)
106+ map< uint32_t ,list<oneFile>> hashes ;
107+ for ( auto & o2 : o .second ) // loop over hash values
103108 {
104- stillOK=false ;
105- oneFile of=o2.second .front ();
106- // Figure out which file is oldest.
107- for each (auto & o3 in o2.second )
108- {
109- if (o3.timeStamp <of.timeStamp )
110- of=o3;
111- }
112- wstring fileName1=of.path +L" \\ " +of.name ;
113- for (list<oneFile>::iterator o3=o2.second .begin ();o3!=o2.second .end ();)
109+ wstring fileName=o2.path +L" \\ " +o2.name ;
110+ uint32_t hash=CalculateFileHash (fileName,o.first );
111+ hashes[hash].push_back (o2);
112+ }
113+
114+ for (auto & o2 : hashes) // loop over hash values
115+ {
116+ bool stillOK=true ;
117+ if (o2.second .size ()>1 && stillOK)
114118 {
115- if (!( (o3->path == of.path ) && (o3->name == of.name ) ))
119+ stillOK=false ;
120+ oneFile of=o2.second .front ();
121+ // Figure out which file is oldest.
122+ for each (auto & o3 in o2.second )
116123 {
117- wstring fileName2=o3->path +L" \\ " +o3->name ;
118- if (AreDuplicates (fileName1, fileName2,o.first ))
124+ if (o3.timeStamp <of.timeStamp )
125+ of=o3;
126+ }
127+ wstring fileName1=of.path +L" \\ " +of.name ;
128+ for (list<oneFile>::iterator o3=o2.second .begin ();o3!=o2.second .end ();)
129+ {
130+ if (!( (o3->path == of.path ) && (o3->name == of.name ) ))
131+ {
132+ wstring fileName2=o3->path +L" \\ " +o3->name ;
133+ if (AreDuplicates (fileName1, fileName2,o.first ))
134+ {
135+ if (showDuplicates)
136+ wprintf (L" %s is a duplicate of %s\n " ,fileName2.c_str (),fileName1.c_str ());
137+ if (deleteFiles)
138+ {
139+ if (!DeleteFileW (fileName2.c_str ()));
140+ wprintf (L" Could not delete %s\n " ,fileName2.c_str ());
141+ }
142+ stillOK=true ;
143+ o3=o2.second .erase (o3);
144+ duplicates++;
145+ bytesSaved+=o.first ;
146+ continue ; // to the loop
147+ }
148+ }
149+ else
119150 {
120- if (showDuplicates)
121- wprintf (L" %s is a duplicate of %s\n " ,fileName2.c_str (),fileName1.c_str ());
122- if (deleteFiles)
123- DeleteFileW (fileName2.c_str ());
124151 stillOK=true ;
125152 o3=o2.second .erase (o3);
126- duplicates++;
127- bytesSaved+=o.first ;
128- continue ; // to the loop
153+ continue ;
129154 }
155+ o3++;
130156 }
131- else
132- {
133- stillOK=true ;
134- o3=o2.second .erase (o3);
135- continue ;
136- }
137- o3++;
138157 }
139158 }
140159 }
@@ -187,16 +206,14 @@ void RecurseFilePath(wstring path)
187206 uint64_t fileSize=findData.nFileSizeHigh ;
188207 fileSize=fileSize<<32 ;
189208 fileSize+=findData.nFileSizeLow ;
190- uint32_t hash=CalculateFileHash (path,findData,fileSize);
191-
192209
193210 o.timeStamp =findData.ftCreationTime .dwHighDateTime ;
194211 o.timeStamp =o.timeStamp <<32 ;
195212 o.timeStamp +=findData.ftCreationTime .dwLowDateTime ;
196213
197214 o.name =findData.cFileName ;
198215 o.path =path;
199- g_files[fileSize][hash] .push_back (o);
216+ g_files[fileSize].push_back (o);
200217 g_filesProcessed++;
201218 }
202219 }while (true );
@@ -240,14 +257,12 @@ bool AreDuplicates(const wstring& file1, const wstring& file2, uint64_t fileSize
240257 return true ;
241258}
242259
243- uint32_t CalculateFileHash (const wstring& path, WIN32_FIND_DATAW& findData, uint64_t fileSize)
260+ uint32_t CalculateFileHash (const wstring& newPath, uint64_t fileSize)
244261{
245262 MH_UINT32 hash=0 ;
246263 MH_UINT32 carry=0 ;
247264 uint64_t totalSize=fileSize;
248265
249- wstring newPath=path+L" \\ " ;
250- newPath+=findData.cFileName ;
251266 FILE* fp=_wfopen (newPath.c_str (),L" rb" );
252267 if (fp)
253268 {
0 commit comments