用C++实现一个小小的爬虫

时间:2022-04-22 08:33:54

        



        先给定一个入口网址,向它发送HTTP请求,把接收到的返回内容保存到html文件夹下以该URL命名的txt文件中,然后再把该URL加入已搜索过的队列;接着分析HTML内容,找出其中的超链接,把超链接放入待搜索队列;最后循环以上步骤,直到待搜索队列中没有内容为止。


编译环境:Visual Studio(Windows平台,需要链接 ws2_32.lib)


#include <WinSock2.h>
#include <errno.h>

#include <algorithm>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>




#pragma comment(lib,"ws2_32.lib")
using namespace std;


// Crawler configuration. ("DEFUAL" is the spelling used throughout this file.)
// TCP port used when connecting to a host (see SendHttp).
#define DEFUAL_PORT 80
// Buffer size constant; in the visible source it is only mentioned in commented-out code.
#define DEFUAL_ARRAY_SIZE 1048576
// Entry URL the crawl starts from (see main).
#define URL "http://www.baidu.com/"
#define SAVE_HTML_DATA_DIRECTORY "./html"// directory in which the downloaded HTML text is stored






// Copies the characters of HostUrl up to (but not including) the first
// occurrence of Sign, or up to the end of the string when Sign never appears.
//
// HostUrl is taken by reference and is advanced past the copied characters, so
// on return it points at Sign or at the terminating '\0'.
//
// Returns a NUL-terminated buffer allocated with new[] (the caller releases it
// with delete[]), or NULL when HostUrl is NULL.
char *InterceptString(char *&HostUrl,char Sign)
{
    if (HostUrl == NULL)
        return NULL;

    const std::size_t kGrowStep = 100;
    std::size_t capacity = kGrowStep;
    std::size_t length = 0;
    char *buffer = new char[capacity];

    while (*HostUrl != Sign && *HostUrl != '\0')
    {
        // Always keep one free slot for the terminating '\0'.
        if (length + 1 == capacity)
        {
            char *bigger = new char[capacity + kGrowStep];
            std::memcpy(bigger, buffer, length);
            // The buffer comes from new[], so it has to be released with
            // delete[]; releasing it with free() is undefined behaviour.
            delete[] buffer;
            buffer = bigger;
            capacity += kGrowStep;
        }
        buffer[length++] = *HostUrl++;
    }
    buffer[length] = '\0';
    return buffer;
}
// Splits a URL into its host and its resource (path) part.
//
// Accepted input: an optional leading "http://", then a host whose first
// character is 'w' (e.g. "www.example.com"), optionally followed by "/path".
// NOTE(review): the 'w' rule rejects every host that does not start with 'w'
// as well as other schemes such as "https://"; it is kept unchanged because
// callers rely on it to discard relative and non-HTTP links.
//
// On success `host` receives everything before the first '/', and `resource`
// receives the remainder starting at that '/' ("/" when the URL has no path).
// Both are new[]-allocated and owned by the caller (release with delete[]).
// On failure the function returns false and does not hand out any buffer.
bool ParseUrl(char *HostUrl,char *&resource,char *&host)
{
    if (HostUrl == NULL)
        return false;

    // Skip the scheme only when it really is the prefix of the URL. Searching
    // for it anywhere (strstr) would also match "http://" embedded in a query
    // string and then skip seven unrelated characters.
    static const char kScheme[] = "http://";
    const std::size_t schemeLength = sizeof(kScheme) - 1;
    if (std::strncmp(HostUrl, kScheme, schemeLength) == 0)
        HostUrl += schemeLength;

    if (*HostUrl != 'w')
        return false;

    // InterceptString advances HostUrl to the '/' (or to the end of the URL).
    host = InterceptString(HostUrl, '/');
    if (host == NULL)
        return false;

    if (*HostUrl == '\0')
    {
        // No path given: request the site root.
        resource = new char[2];
        resource[0] = '/';
        resource[1] = '\0';
        return true;
    }

    resource = InterceptString(HostUrl, '\0');
    if (resource == NULL)
    {
        delete[] host;
        host = NULL;
        return false;
    }
    return true;
}
//发送http请求报文
bool SendHttp(char *HostUrl,char *&Htmlresource,int &ByteRead)
{
char *resource = NULL;
char *host = NULL;
if( !ParseUrl(HostUrl,resource,host) ){
cout<< "Parse Url fail !" << endl;
return false;
}
//创建套接字
SOCKET sock = socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);
if(sock == -1 || sock == -2){
cout<<"socket error ! error number === "<<GetLastError()<<endl;
return false;
}
//通过域名得到对应ip地址(此处需要联网)
hostent *p_TargetHost_ip = gethostbyname(host);
if(p_TargetHost_ip == NULL){
cout<<"gethostbyname error ! error number === "<<GetLastError()<<endl;
return false;
}


sockaddr_in TargetHost;
TargetHost.sin_family = AF_INET;
TargetHost.sin_port = htons(DEFUAL_PORT);
TargetHost.sin_addr.s_addr = *(u_long*)p_TargetHost_ip->h_addr_list[0];

if( connect(sock,(sockaddr*)&TargetHost,sizeof(TargetHost))  < 0){
cout<<"connect error ! error number ===  "<<GetLastError()<<endl;;
return false;
}
//定义发送报文内容
char message[1024] = {0};//此处不应该是固定大小--------------------
char *StringInputFormat = "GET %s HTTP/1.1\r\nHost:%s\r\nConnection:Close\r\n\r\n" ;


sprintf_s(message,1024,StringInputFormat,resource,host);


//发送请求包
size_t sent = 0;
int tmpress;
while(sent < strlen(message)){
tmpress = send(sock,message+sent,strlen(message)-sent,0);


if(tmpress == SOCKET_ERROR){
cout<< "send message fail ! error ===" << GetLastError() << endl;
return false;
}
sent += tmpress;
}
//接收回复包
//char arr[DEFUAL_ARRAY_SIZE] = {0};//错误 数组不能定义这么大,所以采用new的方式
//另外直接开固定数组也容易越界
int Message_Byte = 100000;//这里应该变成可变的大小-----------
char *ReplyMsg = new char[Message_Byte];
memset(ReplyMsg , 0 ,Message_Byte);
ByteRead = 0;
int rempress = 1;
cout << "Read :" ;
while(rempress > 0){
rempress = recv(sock,ReplyMsg+ByteRead,Message_Byte-ByteRead,0);
if(rempress > 0){
ByteRead += rempress;
}
cout << rempress << " ";
}
ReplyMsg[ByteRead] = 0;


//cout<<ReplyMsg << endl;
cout<<ByteRead <<endl;


Htmlresource = ReplyMsg;


free(resource);
resource = NULL;
free(host);
host = NULL;
closesocket(sock);
return true;
}
// Turns a URL into a file name by dropping every character that is not allowed
// in a Windows file name and appending ".txt".
//
// Removed characters: ':' and '/' plus '\\', '?', '*', '"', '<', '>', '|' and
// control characters (below 0x20). Without removing the latter group, URLs
// that carry a query string produce names the file system refuses to create.
// TODO (from the original author): add a parameter that decides whether the
// ".txt" suffix is appended.
//
// Returns a NUL-terminated, new[]-allocated string (release with delete[]),
// or NULL when Url is NULL.
char *UrlTranformateFilename(char *Url)
{
    if (Url == NULL)
        return NULL;

    static const char kExtension[] = ".txt";
    static const char kForbidden[] = ":/\\?*\"<>|";

    // sizeof(kExtension) covers the four suffix characters and the '\0'.
    char *filename = new char[std::strlen(Url) + sizeof(kExtension)];
    char *out = filename;
    for (const char *in = Url; *in != '\0'; ++in) {
        const unsigned char ch = static_cast<unsigned char>(*in);
        if (ch < 0x20 || std::strchr(kForbidden, *in) != NULL)
            continue;
        *out++ = *in;
    }
    std::memcpy(out, kExtension, sizeof(kExtension));
    return filename;
}
// Saves the downloaded text into SAVE_HTML_DATA_DIRECTORY, in a file whose
// name is derived from the URL by UrlTranformateFilename.
//
// Url          URL the text was downloaded from.
// Htmlresource NUL-terminated text to store; nothing is written when NULL.
//
// An existing file with the same name is overwritten. Returns false when no
// file name can be derived, when the file cannot be opened, or when writing
// fails; the caller treats that as non-fatal.
bool InputFile(char *&Url,char *&Htmlresource)
{
    char *Filename = UrlTranformateFilename(Url);
    if (Filename == NULL) {
        // Streaming a NULL char* is undefined behaviour, so print an empty
        // string in that case.
        std::cout << "UrlTranformateFilename fail ! Url ====" << (Url != NULL ? Url : "") << std::endl;
        return false;
    }

    // <directory>/<file name>
    std::string Filepath(SAVE_HTML_DATA_DIRECTORY);
    Filepath += "/";
    Filepath += Filename;
    // The name came from new[]; it is no longer needed once it has been copied.
    delete[] Filename;

    std::cout << Filepath << std::endl;

    // Opening through ofstream truncates any previous content of the file.
    std::ofstream File(Filepath.c_str());
    if (!File.is_open())
        return false;

    if (Htmlresource != NULL)
        File << Htmlresource << std::endl;
    const bool written = !File.fail();
    File.close();
    return written;
}
// Returns true when `url` compares equal (strcmp) to an entry of `urls`.
// The queue is taken by value so that the copy can be drained while searching;
// this costs time linear in the queue length per call.
static bool QueueContainsUrl(std::queue<char*> urls, const char *url)
{
    for (; !urls.empty(); urls.pop()) {
        if (std::strcmp(urls.front(), url) == 0)
            return true;
    }
    return false;
}

// Extracts the hyperlinks of an HTML document and queues the new ones.
//
// Htmlresource   NUL-terminated reply text; only the part starting at the
//                first "<body" is scanned.
// SearchQueue    URLs that have already been crawled (read only).
// NotSearchQueue URLs waiting to be crawled; every href="..." value that is
//                in neither queue is appended as a new[]-allocated copy whose
//                ownership passes to the queue.
//
// Empty links are ignored, and scanning stops at an href whose closing quote
// is missing. Returns false when the text is NULL or contains no "<body",
// true otherwise (also when the body contains no link at all).
bool ParseHtml(char *Htmlresource,std::queue<char*> &SearchQueue,std::queue<char*> &NotSearchQueue)
{
    const char *body = (Htmlresource != NULL) ? std::strstr(Htmlresource, "<body") : NULL;
    if (body == NULL) {
        std::cout << "this Html error ! " << std::endl;
        return false;
    }

    static const char kTarget[] = "href=\"";
    const std::size_t targetLength = sizeof(kTarget) - 1;

    const char *link = std::strstr(body, kTarget);
    if (link == NULL)
        std::cout << "this html no hyperlink !" << std::endl;

    while (link != NULL) {
        link += targetLength;
        const char *closing = std::strchr(link, '"');
        if (closing == NULL)
            break; // unterminated attribute: the rest of the page is not a URL

        const std::size_t length = static_cast<std::size_t>(closing - link);
        if (length > 0) {
            char *Url = new char[length + 1];
            std::memcpy(Url, link, length);
            Url[length] = '\0';

            // Skip links that were crawled already or are already waiting;
            // otherwise pages that link to each other are fetched forever.
            if (QueueContainsUrl(SearchQueue, Url) || QueueContainsUrl(NotSearchQueue, Url))
                delete[] Url;
            else
                NotSearchQueue.push(Url);
        }
        link = std::strstr(closing, kTarget);
    }
    return true;
}


// Processes a single URL of the crawl: downloads it, stores the reply on disk
// and appends the hyperlinks found in it to NotSearchQueue.
//
// Url            URL to fetch.
// SearchQueue    URLs that were crawled already (passed on to ParseHtml).
// NotSearchQueue URLs still to crawl; newly found links are appended here.
//
// Failures are logged to stdout. A download failure ends the step, a failure
// to write the file is ignored so that crawling can continue.
void BFS(char *Url,std::queue<char*> &SearchQueue,std::queue<char*> &NotSearchQueue)
{
    // Fetch the page first.
    char *Htmlresource = NULL;
    int Byte = 0;
    if (!SendHttp(Url, Htmlresource, Byte)) {
        std::cout << "SendHttp  fail ! return !" << std::endl;
        return;
    }

    // Not being able to store the page is no reason to stop crawling.
    if (!InputFile(Url, Htmlresource))
        std::cout << "InputFile fail ! ignore !" << std::endl;

    if (!ParseHtml(Htmlresource, SearchQueue, NotSearchQueue))
        std::cout << "ParseHtml fail ! return " << std::endl;

    // SendHttp hands over a new[]-allocated reply and the queued links are
    // separate copies, so the reply has to be released here on every path.
    delete[] Htmlresource;
}
// One-time start-up: initialises Winsock 2.2 and makes sure the output
// directory SAVE_HTML_DATA_DIRECTORY exists.
//
// Returns false when Winsock cannot be initialised, true otherwise.
bool Init()
{
    WSADATA wsaData;
    // WSAStartup returns its error code directly, so that value is what gets
    // reported rather than the thread's last-error value.
    const int startupResult = WSAStartup(MAKEWORD(2,2), &wsaData);
    if (startupResult != 0)
    {
        printf("WSAStartup error ! error number === %d \n", startupResult);
        return false;
    }

    // Create the directory that receives the downloaded HTML text. The result
    // is deliberately ignored: the call also fails when the directory already
    // exists, and a directory that is really missing shows up later when
    // InputFile cannot open its output file.
    CreateDirectory(TEXT(SAVE_HTML_DATA_DIRECTORY),NULL);
    return true;
}


// Writes every URL stored in the queue to standard output, one per line,
// starting with the front element. The queue is received by value, so only
// this local copy is emptied and the caller's queue keeps its content.
void Print(std::queue<char*> NotSearchQueue)
{
    for (; !NotSearchQueue.empty(); NotSearchQueue.pop())
    {
        char *current = NotSearchQueue.front();
        std::cout << current << std::endl;
    }
}
// Entry point: crawls the web starting at URL until no unvisited link is left.
// Returns -1 when initialisation fails and 0 otherwise.
int main(int argc,char *argv[])
{
    if (!Init()) {
        std::cout << "Init() fail ! " << std::endl;
        return -1;
    }

    // Keep the start URL in a new[]-allocated, writable buffer: the crawl
    // functions take char*, which must not point at a string literal, and it
    // lets every queued URL be released the same way at the end.
    const char kStartUrl[] = URL;
    char *UrlStart = new char[sizeof(kStartUrl)];
    std::memcpy(UrlStart, kStartUrl, sizeof(kStartUrl));
    std::cout << UrlStart << std::endl;

    std::queue<char *> SearchQueue;    // URLs that have been crawled
    std::queue<char *> NotSearchQueue; // URLs still waiting to be crawled

    // Registered before crawling so that the start page counts as visited
    // while its own links are being collected.
    SearchQueue.push(UrlStart);
    BFS(UrlStart, SearchQueue, NotSearchQueue);

    // Continue for as long as unvisited URLs remain.
    while (!NotSearchQueue.empty()) {
        // The URL stays at the front of the pending queue while it is being
        // crawled and is moved to the visited queue afterwards.
        char *current = NotSearchQueue.front();
        std::cout << current << std::endl;
        BFS(current, SearchQueue, NotSearchQueue);
        SearchQueue.push(current);
        NotSearchQueue.pop();
    }

    // Every URL ends up in SearchQueue and was allocated with new[].
    for (; !SearchQueue.empty(); SearchQueue.pop())
        delete[] SearchQueue.front();

    WSACleanup();
    return 0;
}