Java网络爬虫crawler4j学习笔记 WebURL类

时间:2020-11-26 20:41:10

源代码分析

package edu.uci.ics.crawler4j.url;

import java.io.Serializable;

import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;

/**
 * A URL handled by the crawler, together with the bookkeeping data attached to it
 * (document ids, parent page, crawl depth, anchor text, priority, tag).
 *
 * <p>The URL string is the primary key of the persisted entity, and it is also the
 * only field taken into account by {@link #equals(Object)} and {@link #hashCode()}.
 * The domain, sub-domain and path are derived from the URL string in
 * {@link #setURL(String)}.
 */
@Entity // Berkeley DB annotation
public class WebURL implements Serializable {

    private static final long serialVersionUID = 1L;

    @PrimaryKey
    private String url; // URL of this page

    private int docid; // document id assigned to this page
    private int parentDocid; // if a link to page b was found on page a, a's docid is b's parentDocid
    private String parentUrl; // if a link to page b was found on page a, a's URL is b's parentUrl
    private short depth; // crawl depth, counted from 0
    private String domain; // main domain of this page
    private String subDomain; // sub-domain of this page
    private String path; // resource path of this page within the site
    private String anchor; // text inside the hyperlink tag
    private byte priority; // crawl priority; a lower value means a higher priority
    private String tag; // tag in which the URL was found

    /**
     * @return unique document id assigned to this Url.
     */
    public int getDocid() {
        return docid;
    }

    public void setDocid(int docid) {
        this.docid = docid;
    }

    /**
     * @return Url string
     */
    public String getURL() {
        return url;
    }

    /**
     * Stores the URL string and derives {@code domain}, {@code subDomain} and
     * {@code path} from it.
     *
     * <p>The host part starts right after the first {@code "//"} (or at the beginning
     * of the string when there is none) and ends at the first {@code '/'} or
     * {@code '?'} that follows, or at the end of the string. No further normalisation
     * is applied, so anything else located in that span (for example a port) stays
     * part of the host text.
     *
     * @param url the URL string to store and split
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public void setURL(String url) {
        this.url = url;

        // The host starts right after "//". Without a "//" the host starts at index 0;
        // blindly adding 2 to indexOf's -1 would skip the first character.
        int schemeSeparatorIdx = url.indexOf("//");
        int domainStartIdx = schemeSeparatorIdx >= 0 ? schemeSeparatorIdx + 2 : 0;

        // The first slash after the host ends it, e.g. "http://www.baidu.com/".
        // Some URLs have no such slash, e.g. "http://www.baidu.com".
        int slashIdx = url.indexOf('/', domainStartIdx);
        int domainEndIdx = slashIdx > domainStartIdx ? slashIdx : url.length();

        // A query may directly follow the host ("http://example.com?q=1"); it must not
        // become part of the domain.
        int queryIdx = url.indexOf('?', domainStartIdx);
        if (queryIdx >= 0 && queryIdx < domainEndIdx) {
            domainEndIdx = queryIdx;
        }

        String host = url.substring(domainStartIdx, domainEndIdx);
        domain = host;
        subDomain = "";

        // Split the host on dots.
        String[] parts = host.split("\\.");
        if (parts.length > 2) {
            // By default the domain is the last two labels, e.g. "baidu.com" in "www.baidu.com".
            domain = parts[parts.length - 2] + "." + parts[parts.length - 1];
            int limit = 2;
            // Some domains need three labels, e.g. "sina.com.cn" in "www.sina.com.cn":
            // this is the case when the two-label candidate is known to TLDList.
            if (TLDList.getInstance().contains(domain)) {
                domain = parts[parts.length - 3] + "." + domain;
                limit = 3;
            }
            // Everything in front of the domain labels forms the sub-domain.
            StringBuilder prefix = new StringBuilder();
            for (int i = 0; i < parts.length - limit; i++) {
                // Insert the separator between labels.
                if (prefix.length() > 0) {
                    prefix.append('.');
                }
                prefix.append(parts[i]);
            }
            subDomain = prefix.toString();
        }

        // The path is whatever follows the host; if the URL carries parameters
        // (contains '?'), everything from the '?' on is not part of the path.
        String remainder = url.substring(domainEndIdx);
        int pathEndIdx = remainder.indexOf('?');
        path = pathEndIdx >= 0 ? remainder.substring(0, pathEndIdx) : remainder;
    }

    /**
     * @return
     *      unique document id of the parent page. The parent page is the
     *      page in which the Url of this page is first observed.
     */
    public int getParentDocid() {
        return parentDocid;
    }

    public void setParentDocid(int parentDocid) {
        this.parentDocid = parentDocid;
    }

    /**
     * @return
     *      url of the parent page. The parent page is the page in which
     *      the Url of this page is first observed.
     */
    public String getParentUrl() {
        return parentUrl;
    }

    public void setParentUrl(String parentUrl) {
        this.parentUrl = parentUrl;
    }

    /**
     * @return
     *      crawl depth at which this Url is first observed. Seed Urls
     *      are at depth 0. Urls that are extracted from seed Urls are at depth 1, etc.
     */
    public short getDepth() {
        return depth;
    }

    public void setDepth(short depth) {
        this.depth = depth;
    }

    /**
     * @return
     *      domain of this Url. For 'http://www.example.com/sample.htm', domain will be 'example.com'
     */
    public String getDomain() {
        return domain;
    }

    /**
     * @return
     *      sub-domain of this Url. For 'http://www.example.com/sample.htm', sub-domain will be 'www'.
     *      It is the empty string when the host has no more than two dot-separated labels.
     */
    public String getSubDomain() {
        return subDomain;
    }

    /**
     * @return
     *      path of this Url, including the leading slash and excluding any query string.
     *      For 'http://www.example.com/sample.htm', path will be '/sample.htm'
     */
    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * @return
     *      anchor string. For example, in &lt;a href="example.com"&gt;A sample anchor&lt;/a&gt;
     *      the anchor string is 'A sample anchor'
     */
    public String getAnchor() {
        return anchor;
    }

    public void setAnchor(String anchor) {
        this.anchor = anchor;
    }

    /**
     * @return priority for crawling this URL. A lower number results in higher priority.
     */
    public byte getPriority() {
        return priority;
    }

    public void setPriority(byte priority) {
        this.priority = priority;
    }

    /**
     * @return tag in which this URL is found, like 'a', 'href', etc.
     */
    public String getTag() {
        return tag;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    /**
     * Hash code based on the URL string only; {@code 0} when no URL has been set,
     * so that hashing an unset instance does not throw.
     */
    @Override
    public int hashCode() {
        return url == null ? 0 : url.hashCode();
    }

    /**
     * Two instances are equal when they are of the same class and hold the same
     * non-null URL string. An instance without a URL is only equal to itself.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        WebURL otherUrl = (WebURL) o;
        return url != null && url.equals(otherUrl.getURL());
    }

    /**
     * @return the URL string (may be {@code null} when no URL has been set)
     */
    @Override
    public String toString() {
        return url;
    }
}

测试

Java网络爬虫crawler4j学习笔记 WebURL类