废话不多说,直接上代码:
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
/*
* File: main.cpp
* Author: yangchao
*
*/
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <iostream>
#include <string>
using namespace std;
/*
 * Splits a URL into its host part and its page path.
 *
 * A leading "http://" or "https://" scheme is removed; the scheme is only
 * recognised at the very start of the URL, so a scheme that appears later
 * (for example inside a query string) is left untouched. Everything from the
 * first '/' after the scheme onwards becomes the page path; if there is no
 * such '/', the page path is "/".
 *
 * url      - the URL to split, with or without a scheme prefix.
 * hostUrl  - output: the text between the scheme and the first '/'.
 * pagePath - output: the path starting at the first '/', or "/" if none.
 */
void parseHostAndPagePath(const std::string url, std::string &hostUrl, std::string &pagePath) {
    hostUrl = url;
    pagePath = "/";

    // Strip the scheme only when it is a true prefix (7 and 8 are the
    // lengths of the two literals).
    if (hostUrl.compare(0, 7, "http://") == 0) {
        hostUrl.erase(0, 7);
    } else if (hostUrl.compare(0, 8, "https://") == 0) {
        hostUrl.erase(0, 8);
    }

    // Compare against npos rather than narrowing the index to int.
    const std::string::size_type slash = hostUrl.find('/');
    if (slash != std::string::npos) {
        pagePath = hostUrl.substr(slash);
        hostUrl.erase(slash);
    }
}
string getPageContent( const string url){
struct hostent *host;
string hostUrl,pagePath;
parseHostAndPagePath(url,hostUrl,pagePath);
if (0==(host=gethostbyname(hostUrl.c_str())))
{
cout<< "gethostbyname error\n" <<endl;
exit (1);
}
struct sockaddr_in pin;
int port=80;
bzero(&pin, sizeof (pin));
pin.sin_family=AF_INET;
pin.sin_port=htons(port);
pin.sin_addr.s_addr=(( struct in_addr*)(host->h_addr))->s_addr;
int isock;
if ((isock=socket(AF_INET,SOCK_STREAM,0))==-1)
{
cout<< "open socket error\n" <<endl;
exit (1);
}
string requestHeader;
requestHeader= "GET " +pagePath+ " HTTP/1.1\r\n" ;
requestHeader+= "Host: " +hostUrl+ "\r\n" ;
requestHeader+= "Accept: */*\r\n" ;
requestHeader+= "User-Agent: Mozilla/4.0(compatible)\r\n" ;
requestHeader+= "connection:Keep-Alive\r\n" ;
requestHeader+= "\r\n" ;
if (connect(isock,( const sockaddr*)&pin, sizeof (pin))==-1){
cout<< "connect error\n" <<endl;
exit (1);
}
if (send(isock,requestHeader.c_str(),requestHeader.size(),0)==-1){
cout<< "send error\n" <<endl;
exit (1);
}
struct timeval timeout={1,0};
setsockopt(isock,SOL_SOCKET,SO_RCVTIMEO,( char *)&timeout, sizeof ( struct timeval));
char c;
bool flag= true ;
while (recv(isock,&c,1,0)>0){
if ( '\r' ==c){
continue ;
} else if ( '\n' ==c){
if ( false ==flag)
break ;
flag= false ;
} else {
flag= true ;
}
}
int len,BUFFER_SIZE=512;
char buffer[BUFFER_SIZE];
string pageContent= "" ;
while ((len=recv(isock,buffer,BUFFER_SIZE-1,0))>0){
buffer[len]= '\0' ;
pageContent+=buffer;
}
return pageContent;
}
int main( int argc, char ** argv) {
cout<<getPageContent( "http://www.hao123.com" )<<endl;
return 0;
}
以上这篇 Linux C++ 模拟简易网络爬虫实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。