-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.cpp
More file actions
145 lines (128 loc) · 6.03 KB
/
main.cpp
File metadata and controls
145 lines (128 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
//
// main.cpp
// Tendency
//
// Created by Suneeth Torke on 10/21/20.
//
#include <stdio.h>
#include "base.hpp"
#include <iomanip>
using namespace std;
// When true, prep_data() logs which author each essay file was credited to.
const bool OUTPUT = true;
// Number of essay files that prep_data() reads (essay01.txt ... essay85.txt).
const int SOURCE_COUNT = 85;
// TODO: change 'DISPUTED_ESSAY' to choose which document is being analyzed.
// The disputed articles are [49, 50, 51, 52, 53, 54, 55, 56, 57].
const int DISPUTED_ESSAY = 50;
// TODO: change the following values to adjust the feature depth, interval and number of runs.
// Maximum feature count, i.e. the feature depth used by the last run.
const int MAX_DEPTH = 1000;
// Step by which the feature count grows from one run to the next.
const int INTERVAL = 10;
// Number of sample runs whose scores are averaged (integer division of the two values above).
const int RUNS = MAX_DEPTH / INTERVAL;
// Display names of the candidates; the last entry labels the essay under analysis
// rather than a real author.
const vector<string> CANDIDATES {{"Alexander Hamilton"}, {"James Madison"},
{"John Jay"}, {"Disputed Authorship"}};
// Normalizes an essay in place so that it can be tokenized on whitespace:
//   1. '-', '/' and '\n' are replaced by a space (so "well-known" becomes two words),
//   2. the remaining punctuation in `stripped` is deleted outright,
//   3. every character is lower-cased.
// Characters that appear in neither set (apostrophes, digits, brackets, ...) are kept.
void sanitize(string &str) {
    // '-' and '/' used to be listed in the strip set as well, but they have already
    // been turned into spaces by the time stripping happens, so they are omitted here.
    static const string separators = "-/\n";
    static const string stripped = ".?!$^~<>=+\":;{}(),";

    for (char &c : str) {
        if (separators.find(c) != string::npos) c = ' ';
    }

    // single erase-remove pass instead of one full scan per punctuation character
    str.erase(std::remove_if(str.begin(), str.end(),
                             [](char c) { return stripped.find(c) != string::npos; }),
              str.end());

    // std::tolower is only defined for values representable as unsigned char (or EOF);
    // taking the argument as unsigned char keeps non-ASCII bytes from being passed
    // as negative ints.
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
}
// reads every essay in the provided folder and credits it to the correct author
// output a vectors of authors, whose elements contain each authors subcorpus of the essays
vector<pair<string,string>> prep_data() {
vector<string> files(SOURCE_COUNT);
// holds the relevant essays seperated by author, doesnt include the disputed essays
for (const auto & entry :__fs::filesystem::directory_iterator("/Users/torke/Desktop/essays")) files.push_back(entry.path());
const vector<vector<int>> essays {{1, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 59, 60, 61, 65, 66, 67, 68, 69, 70,
71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85}, {10, 18, 19, 20, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 58}, {2, 3, 4, 5, 64}, {DISPUTED_ESSAY}};
vector<pair<string,string>> sources(CANDIDATES.size());
cout << "Prepped Files:" << endl;
for (int a = 0; a < CANDIDATES.size(); ++a) sources[a] = {CANDIDATES[a], ""};
for (int i = 0; i < SOURCE_COUNT; ++i) {
string filename;
if (i < 9) filename = "essay0" + to_string(i + 1) + ".txt";
else filename = "essay" + to_string(i + 1) + ".txt";
std::ifstream in(filename);
std::stringstream buffer;
buffer << in.rdbuf();
std::string essay = buffer.str();
sanitize(essay);
for (size_t j = 0; j < sources.size(); ++j) {
// if the essay number, its adds it to the relevant author's text string
const auto source_iter = std::find(begin(essays[j]), end(essays[j]), i + 1);
if (source_iter != end(essays[j])) {
if (OUTPUT) cout << filename + " -> " + sources[j].first << endl;
sources[j].second = sources[j].second + " " + essay;
}
}
}
cout << "Analysing Document: {" + to_string(DISPUTED_ESSAY) + "}" << endl;
return sources;
}
// Prints the run report after all runs have completed and names the most likely author.
//
// authors: {name, text} pairs; only the names are used here.
// deltas:  per-author delta scores summed over all runs, index-aligned with `authors`.
//          Each score is divided by RUNS for display; the lowest score wins.
//
// If there is no score to report (empty `deltas` or `authors`) only the header is printed.
// The `const void` return type is kept as-is so the signature stays unchanged.
const void generate_output (const vector<pair<string,string>> &authors, const vector<double> &deltas) {
    cout << endl;
    cout << " |The following Results were calculated by slicing every: " + to_string(INTERVAL) + "th possible <fDepth> and" << endl;
    cout << " |averaging all subcomponents in " + to_string(RUNS) + " runs to calculate the Mixed Depth Delta Score." << endl << endl;

    // only entries that have both a name and a score can be reported
    const size_t count = std::min(authors.size(), deltas.size());
    for (size_t i = 0; i < count; ++i) {
        cout << " The MDD Score for [" + authors[i].first + "]: " + to_string(deltas[i] / RUNS) << endl;
    }
    cout << endl;

    // min_element on an empty range returns end(), which must not be dereferenced
    if (count == 0) return;

    // lowest delta score among the reported authors
    const double lowest = *min_element(begin(deltas), begin(deltas) + count);
    cout << " |Based on these findings ESSAY " + to_string(DISPUTED_ESSAY) + " was" << endl;
    cout << " |most likely written by: ";
    // exact comparison is intentional: `lowest` is a copy of one of these elements,
    // and every author tied for the minimum is listed
    for (size_t i = 0; i < count; ++i) {
        if (deltas[i] == lowest) cout << authors[i].first << endl << endl;
    }
}
// Builds a frequency table of ALL whitespace-separated words in `body`.
//
// body: the text to analyze; it is only read (the non-const reference is kept for
//       compatibility with existing callers).
// Returns {word, occurrence count} pairs ordered by the project's compare() functor
// (the original comment describes this as descending order).
// An empty or all-whitespace body yields an empty table.
vector<pair<string,int>> generate_features(string &body) {
    // split the text into individual words
    vector<string> words;
    stringstream ss(body);
    string word;
    while (ss >> word) words.emplace_back(word);

    // sorting puts duplicate words side by side, so each distinct word forms one run
    sort(begin(words), end(words));

    vector<pair<string,int>> corpus;
    size_t run_start = 0;
    // Close a run whenever the word changes OR the input ends. The previous version only
    // flushed on a change, so the final run (the lexicographically greatest word) was
    // never added, and it read list.front() even when there were no words at all.
    for (size_t i = 1; i <= words.size(); ++i) {
        if (i == words.size() || words[i] != words[run_start]) {
            corpus.push_back({words[run_start], static_cast<int>(i - run_start)});
            run_start = i;
        }
    }

    // order the table with the project's comparator
    sort(begin(corpus), end(corpus), compare());
    cout << "Created Corpus:" << endl;
    return corpus;
}
// Entry point: prepares the per-author texts once, builds one frequency table over all of
// them, then evaluates `delta` at growing feature depths (INTERVAL, 2*INTERVAL, ...,
// RUNS*INTERVAL) and accumulates the per-author sub-deltas for the final report.
int main(int argc, const char * argv[]) {
    // preps and generates all of the full texts ONE time
    const vector<pair<string,string>> authors = prep_data();

    string cumulative;
    for (const auto &author : authors) cumulative += author.second;

    // generates a completed frequency table of features
    const vector<pair<string,int>> features = generate_features(cumulative);

    // one accumulator per entry of `authors` except the last (the essay under analysis);
    // guarded so an empty result cannot underflow the unsigned size
    vector<double> deltas(authors.empty() ? 0 : authors.size() - 1);

    for (int run = 1; run <= RUNS; ++run) {
        // Take the first `depth` features for this run. The depth is clamped to the table
        // size: resizing past the end would pad the list with empty {"", 0} features.
        const size_t requested = static_cast<size_t>(run) * INTERVAL;
        const size_t depth = std::min(requested, features.size());
        vector<pair<string,int>> mirror(features.begin(), features.begin() + depth);

        delta temp = delta(mirror, authors, false);
        // never write past the accumulators, whatever size sub_delta has
        for (size_t k = 0; k < temp.sub_delta.size() && k < deltas.size(); ++k) deltas[k] += temp.sub_delta[k];
        cout << " [RUN " + to_string(run) + "] -> fDepth: " + to_string(depth) << endl;
    }

    // prints the run report
    generate_output(authors, deltas);
}