Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Build rules for the crawler executable "bighomework".
# Every translation unit includes head.h, which pulls in C++11 headers
# (<regex>), so the language standard is passed to each compile step and
# not only to the final link.
# The executable depends on the object files only; each object rule below
# already tracks its own .cpp source and head.h.
bighomework : main.o getcontent.o dfs.o datechange.o dealwith.o
	g++ -std=c++11 -o bighomework main.o getcontent.o dfs.o datechange.o dealwith.o
main.o : main.cpp head.h
	g++ -std=c++11 -c main.cpp
dfs.o : dfs.cpp head.h
	g++ -std=c++11 -c dfs.cpp
datechange.o : datechange.cpp head.h
	g++ -std=c++11 -c datechange.cpp
getcontent.o : getcontent.cpp head.h
	g++ -std=c++11 -c getcontent.cpp
dealwith.o : dealwith.cpp head.h
	g++ -std=c++11 -c dealwith.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "head.h"
/// Converts a textual date such as "2015年3月4日" into the integer 20150304.
///
/// The year, month and day are taken to be the first three runs of ASCII
/// digits in the string. Bytes of multi-byte characters never fall in the
/// '0'..'9' range in either GBK or UTF-8, so the result does not depend on
/// how the 年/月/日 markers are encoded.
///
/// @param date  Date text containing year, month and day numbers in that order.
/// @return year * 10000 + month * 100 + day, or 0 when the text does not
///         contain three numeric fields or a field is implausibly large.
int datechange(std::string date)
{
    int fields[3] = {0, 0, 0};
    int count = 0;
    const std::string::size_type length = date.size();
    std::string::size_type pos = 0;

    while (pos < length && count < 3)
    {
        if (date[pos] < '0' || date[pos] > '9')
        {
            ++pos;
            continue;
        }
        int value = 0;
        while (pos < length && date[pos] >= '0' && date[pos] <= '9')
        {
            // Reject absurdly long digit runs before they can overflow.
            if (value > 9999)
                return 0;
            value = value * 10 + (date[pos] - '0');
            ++pos;
        }
        fields[count++] = value;
    }

    if (count < 3)
        return 0;
    if (fields[0] > 9999 || fields[1] > 99 || fields[2] > 99)
        return 0;
    return fields[0] * 10000 + fields[1] * 100 + fields[2];
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#include "head.h"
void dealwith12(string command,string file,int Len,int loc)
{
string name = "../data/";
string stringdata;
int intdata;
int Loc,Loc1,Loc2;
int i = 0;
while(command[loc+Len+i] <= '9'&&command[loc+Len+i] >= '0')
{
name += command[loc+Len+i];
i++;
}
name += ".html";
ofstream foutdata;
foutdata.open(name);
foutdata << name << endl;
Loc = file.find("<td class=\"t-title\">失踪人姓名</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
foutdata << stringdata << endl;
Loc = file.find("<td class=\"t-title\">性别</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
if(stringdata == "男")
foutdata << 0 << endl;
else
foutdata << 1 << endl;
Loc = file.find("<td class=\"t-title\">失踪人籍贯</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
foutdata << stringdata << endl;
Loc = file.find("<td class=\"t-title\">出生日期</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
int birthday;
birthday = datechange(stringdata);
foutdata << birthday << endl;
Loc = file.find("<td class=\"t-title\">失踪日期</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
int lostday;
lostday = datechange(stringdata);
foutdata << lostday << endl;
Loc = file.find("<td class=\"t-title\">失踪时身高</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
int height = 0;
for(int j = 0;j < Loc2-Loc1-4;j++)
{
height = height*10 + int(stringdata[j]-48);
}
foutdata << height << endl;
Loc = file.find("<td class=\"t-title\">失踪地点</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
foutdata << stringdata << endl;
Loc = file.find("<td class=\"t-title\">可能去向</td>");
Loc1 = file.find("<td>",Loc+1);
Loc2 = file.find("</td>",Loc1+1);
stringdata = file.substr(Loc1+4,Loc2-Loc1-4);
foutdata << stringdata << endl;
Loc = file.find("<td class=\"t-title\">详细信息</td>");
Loc1 = file.find("<td",Loc+1);
Loc2 = file.find("</span></p></td>",Loc1+1);
stringdata = file.substr(Loc1+3,Loc2-Loc1-3);
string prestringdata;
for(int k = 0 ; k < Loc2-Loc1-3;k++)
{
if(!(stringdata[k] < 48&&stringdata[k] >= 0&&stringdata[k] > 57&&stringdata[k] < 128))
{
prestringdata += stringdata[k];
}
}
ofstream foutinput;
ifstream finca;
string _part,all;
foutinput.open("input.txt");
foutinput << prestringdata << endl;
system("python take_apart.py");
finca.open("cache.txt");
while(getline(finca,_part))
{
all += _part;
}
foutdata << all << endl;
}
// Keeps ASCII digits and every non-ASCII byte (the multi-byte text) and drops
// all other ASCII characters, i.e. the HTML markup and punctuation.
static std::string dealwith3_stripAsciiNonDigits(const std::string& text)
{
    std::string kept;
    kept.reserve(text.size());
    for (std::string::size_type i = 0; i < text.size(); ++i)
    {
        const unsigned char byte = static_cast<unsigned char>(text[i]);
        const bool isDigit = byte >= '0' && byte <= '9';
        if (byte >= 128 || isDigit)
            kept += text[i];
    }
    return kept;
}

/// Extracts data from an article-style page and writes it, one item per
/// line, to "../data/<id>.html".
///
/// Output lines, in order: the output path, the text between "姓名: " and the
/// next ",", the gender (0 when the text after the following space starts
/// with "男", otherwise 1), and the word-segmented article text produced by
/// take_apart.py.
///
/// NOTE(review): the article text is assumed to be everything between
/// <div class="article-content"> and the first "</div>" after it — confirm
/// against a real page.
/// TODO: native place and birth date ("出生于" ... "日") are not written yet.
///
/// @param command  Text containing the page URL; the record id is the run of
///                 digits starting at offset loc + Len.
/// @param file     Full HTML of the page with line breaks removed.
/// @param Len      Length of the URL prefix that precedes the id.
/// @param loc      Offset of that URL prefix inside `command`.
void dealwith3(std::string command, std::string file, int Len, int loc)
{
    // Build the output path from the numeric id that follows the URL prefix.
    std::string name = "../data/";
    if (loc >= 0 && Len >= 0)
    {
        std::string::size_type pos = static_cast<std::string::size_type>(loc)
                                   + static_cast<std::string::size_type>(Len);
        while (pos < command.size() && command[pos] >= '0' && command[pos] <= '9')
        {
            name += command[pos];
            ++pos;
        }
    }
    name += ".html";

    std::ofstream foutdata;
    foutdata.open(name);
    if (!foutdata.is_open())
    {
        std::cerr << "Can't open " << name << " for writing\n";
        return;
    }
    foutdata << name << std::endl;

    // Name: the text between the label and the next comma.
    const std::string nameLabel = "姓名: ";
    std::string personName;
    std::string::size_type nameEnd = std::string::npos;
    const std::string::size_type labelPos = file.find(nameLabel);
    if (labelPos != std::string::npos)
    {
        nameEnd = file.find(",", labelPos + nameLabel.size());
        if (nameEnd != std::string::npos)
            personName = file.substr(labelPos + nameLabel.size(),
                                     nameEnd - labelPos - nameLabel.size());
    }
    foutdata << personName << std::endl;

    // Gender: "男" is a multi-byte character, so compare as many bytes as the
    // literal has rather than a single byte.
    const std::string male = "男";
    bool isMale = false;
    if (nameEnd != std::string::npos)
    {
        const std::string::size_type space = file.find(" ", nameEnd + 1);
        if (space != std::string::npos)
            isMale = file.compare(space + 1, male.size(), male) == 0;
    }
    if (isMale)
        foutdata << 0 << std::endl;
    else
        foutdata << 1 << std::endl;

    // Article text: contents of the article-content div.
    const std::string article = "<div class=\"article-content\">";
    std::string articleText;
    const std::string::size_type articlePos = file.find(article);
    if (articlePos != std::string::npos)
    {
        const std::string::size_type bodyStart = articlePos + article.size();
        const std::string::size_type bodyEnd = file.find("</div>", bodyStart);
        if (bodyEnd != std::string::npos)
            articleText = file.substr(bodyStart, bodyEnd - bodyStart);
    }
    const std::string prestringdata = dealwith3_stripAsciiNonDigits(articleText);

    // Hand the cleaned text to the Python segmenter through input.txt; the
    // stream is closed before the script starts so the data is on disk.
    {
        std::ofstream foutinput("input.txt");
        foutinput << prestringdata << std::endl;
    }

    std::string all;
    if (std::system("python take_apart.py") == 0)
    {
        std::ifstream finca("cache.txt");
        std::string part;
        while (std::getline(finca, part))
        {
            all += part;
        }
    }
    else
    {
        // Do not reuse a cache.txt left over from an earlier record.
        std::cerr << "take_apart.py failed for " << name << "\n";
    }
    foutdata << all << std::endl;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#include "head.h"

/// Crawls the site starting from _root.
///
/// URLs are taken from the front of url_to_be_crawled; each URL not yet in
/// url_already_be_crawled is counted and passed to getContent(), which may
/// queue further URLs. The number of fetched URLs is printed at the end.
void crawled::deep_search(void)
{
    url_to_be_crawled.push(_root);

    for (;;)
    {
        if (url_to_be_crawled.empty())
            break;

        current_url = url_to_be_crawled.front();
        url_to_be_crawled.pop();

        const bool first_visit = (url_already_be_crawled.count(current_url) == 0);
        if (first_visit)
        {
            ++number;
            getContent(current_url);
        }
        // Marked as crawled only after the fetch, so getContent() still sees
        // the current URL as not yet crawled.
        url_already_be_crawled.insert(current_url);
    }

    std::cout << number << std::endl;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include "head.h"
void crawled::getContent(string url)
{
string command;
string infloc = "https://www.zgszrkdak.cn/home/person/show/id/";
string infloc2 = "https://www.zgszrkdak.cn/home/family/show/id/";
string infloc3 = "https://www.zgszrkdak.cn/home/news/index/classid/";
command = "wget -O url_has_been_crawled.txt --tries=2 --timeout=3 \"";
command = command + url + "\"";
sleep(0.4);
system(command.data());
ifstream fin;
ofstream fout;
string file,file_part;
int length = 0,len = 0,len1 = 0,location_first = 0,location_end = 0;
string url_format1 = "href=";
string url_format2 ;
fin.open("url_has_been_crawled.txt");
fout.open("all_different_urls.txt");
if(fin.is_open() == false)
{
cerr << "Can't open file!\n";
exit(EXIT_FAILURE);
}
while(getline(fin,file_part))
{
file += file_part;
}
if(command.find(infloc) != string::npos||command.find(infloc2) != string::npos)
{
int loc ,Len;
loc = command.find(infloc);
Len = infloc.length();
dealwith12(command,file,Len,loc);
}
if(command.find(infloc3) != string::npos)
{
int loc ,Len;
loc = command.find(infloc);
Len = infloc3.length();
//dealwith3(command,file,Len,loc);
}
location_first = file.find(url_format1);
if(location_first == string::npos)
return;
else
{
while(location_first != string::npos)
{
string url_to_be_finded;
url_format2 = file[location_first+5];
location_end = file.find(url_format2,location_first+6);
len = location_end-location_first-6;
url_to_be_finded = file.substr(location_first+6,len);
if(url_to_be_finded.find(_root) != string::npos)
{
if(url_already_be_crawled.find(url_to_be_finded) == url_already_be_crawled.end())
{
fout << url_to_be_finded << endl;
url_to_be_crawled.push(url_to_be_finded);
}
}
if(url_to_be_finded.find("http://") == string::npos)
{
int loc = url_to_be_finded.find("/");
string url_to_be_finded_part;
if(loc == 0)
url_to_be_finded = _root+url_to_be_finded;
else if(loc == string::npos)
url_to_be_finded = _root + "/" + url_to_be_finded;
else
{
url_to_be_finded_part = url_to_be_finded.substr(0,loc);
url_to_be_finded = _root + "/" + url_to_be_finded_part;
}
if(url_already_be_crawled.find(url_to_be_finded) == url_already_be_crawled.end())
{
fout << url_to_be_finded << endl;
url_to_be_crawled.push(url_to_be_finded);
}
}
location_first = file.find(url_format1,location_first+1);
url_to_be_finded.erase(0,200);
}
}
return;
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include <stdlib.h>
#include <time.h>

#include <cctype>
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <queue>
#include <regex>
#include <set>
#include <string>
#include <thread>
#include <vector>
using namespace std;
// Packs the year, month and day digits of a date string written with
// 年/月/日 markers into a single integer of the form YYYYMMDD.
int datechange(string date);
// Extracts the table fields of a detail page held in `file` and writes them
// to ../data/<id>.html. The id digits are read from `command` starting at
// offset (4th argument + 3rd argument); the 3rd argument is the length of the
// URL prefix (it is named Len in the definition).
void dealwith12(string command,string file,int Loc ,int loc);
// Counterpart of dealwith12 for pages containing a
// <div class="article-content"> block; takes the same arguments and writes
// to ../data/<id>.html in the same way.
void dealwith3(string command,string file,int Loc ,int loc);
/// Crawler state: a queue of URLs still to fetch and the set of URLs already
/// fetched. Constructing it with a root URL runs the whole crawl.
class crawled
{
public:
    /// Creates an idle crawler with a zero page counter.
    crawled() : number(0) {}

    /// Stores `root` as the start URL and immediately runs the crawl.
    crawled(std::string root) : number(0), _root(root)
    {
        deep_search();
    }

private:
    int number;                                    // pages fetched so far
    std::string _root;                             // start URL of the crawl
    std::string current_url;                       // URL currently being processed
    std::queue<std::string> url_to_be_crawled;     // pending URLs, first in first out
    std::set<std::string> url_already_be_crawled;  // URLs already processed

    void getContent(std::string);
    void deep_search(void);
};



Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include "head.h"
/// Reads the root URL from standard input, runs the crawl and prints the
/// processor time consumed, in clock() ticks.
///
/// @return 0 on success, 1 when no URL could be read.
int main()
{
    std::string root;
    const clock_t time_beg = clock();

    std::cout << "Please input the name of url that you are going to crawl" << std::endl;
    if (!(std::cin >> root))
    {
        // Without a root every link would count as "on site"; refuse to start.
        std::cerr << "No url was given, nothing to crawl" << std::endl;
        return 1;
    }

    // Constructing the crawler with a root URL performs the whole crawl.
    crawled website = crawled(root);

    const clock_t time_end = clock();
    const long elapsed = static_cast<long>(time_end - time_beg);
    std::cout << elapsed << std::endl;
    return 0;
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#encoding=utf-8
# Reads the text in input.txt, runs jieba part-of-speech segmentation on it
# and writes str() of every resulting item to cache.txt, one per line.
# Written so that it runs under both Python 2 and Python 3: the Python 2-only
# "print >>file" form is replaced by file.write().
import jieba
import jieba.posseg as pseg

filename = "input.txt"
database = "cache.txt"

# "with" closes both files even if segmentation raises.
with open(filename, "r") as f:
    line = f.read()

with open(database, "w") as fn:
    for word in pseg.cut(line):
        fn.write(str(word) + "\n")
Loading