Java网络爬虫crawler4j学习笔记 WebURL类

时间:2020-11-26 20:41:10


package edu.uci.ics.crawler4j.url;


import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;
import java.io.Serializable;

@Entity // Berkley DB Annotation
public class WebURL implements Serializable {

private static final long serialVersionUID = 1L;

private String url; // 当前页面的url

private int docid; // 为当前网页分配的一个docId
private int parentDocid; // 在网页a的页面上找到指向b的链接,则a是b的parentDocid
private String parentUrl; // 在网页a的页面上找到指向b的链接,则a是b的parentUrl
private short depth; // 爬取深度, 从0开始计数
private String domain; // 当前网页的主域名
private String subDomain; // 当前网页的子域名
private String path; // 当前网页在网站中的资源路径
private String anchor; // 超链接标签中的文本
private byte priority; // 爬取的优先级,越低代表优先级越高
private String tag; // 标签

* @return unique document id assigned to this Url.

public int getDocid() {
return docid;

public void setDocid(int docid) {
this.docid = docid;

* @return Url string

public String getURL() {
return url;

public void setURL(String url) {
this.url = url;

// 从"http://"开始作为domain的起点
int domainStartIdx = url.indexOf("//") + 2;
// 第一个斜杠作为domain的终点,例如”“
int domainEndIdx = url.indexOf('/', domainStartIdx);
// 有点没有斜杠,如
domainEndIdx = domainEndIdx > domainStartIdx ? domainEndIdx : url.length();
domain = url.substring(domainStartIdx, domainEndIdx);
subDomain = "";
String[] parts = domain.split("\\.");
if (parts.length > 2) {
// 默认的domain包含两个字段,如中的
domain = parts[parts.length - 2] + "." + parts[parts.length - 1];
int limit = 2;
// 有的包含3个字段,如中的
if (TLDList.getInstance().contains(domain)) {
domain = parts[parts.length - 3] + "." + domain;
limit = 3;
for (int i = 0; i < parts.length - limit; i++) {
// 加上分隔符
if (subDomain.length() > 0) {
subDomain += ".";
subDomain += parts[i];
path = url.substring(domainEndIdx);
// 如果url中带有参数(即含有?),则?之后的不是path
int pathEndIdx = path.indexOf('?');
if (pathEndIdx >= 0) {
path = path.substring(0, pathEndIdx);

* @return
* unique document id of the parent page. The parent page is the
* page in which the Url of this page is first observed.

public int getParentDocid() {
return parentDocid;

public void setParentDocid(int parentDocid) {
this.parentDocid = parentDocid;

* @return
* url of the parent page. The parent page is the page in which
* the Url of this page is first observed.

public String getParentUrl() {
return parentUrl;

public void setParentUrl(String parentUrl) {
this.parentUrl = parentUrl;

* @return
* crawl depth at which this Url is first observed. Seed Urls
* are at depth 0. Urls that are extracted from seed Urls are at depth 1, etc.

public short getDepth() {
return depth;

public void setDepth(short depth) {
this.depth = depth;

* @return
* domain of this Url. For '', domain will be ''

public String getDomain() {
return domain;

public String getSubDomain() {
return subDomain;

* @return
* path of this Url. For '', path will be 'sample.htm'

public String getPath() {
return path;

public void setPath(String path) {
this.path = path;

* @return
* anchor string. For example, in <a href="">A sample anchor</a>
* the anchor string is 'A sample anchor'

public String getAnchor() {
return anchor;

public void setAnchor(String anchor) {
this.anchor = anchor;

* @return priority for crawling this URL. A lower number results in higher priority.

public byte getPriority() {
return priority;

public void setPriority(byte priority) {
this.priority = priority;

* @return tag in which this URL is found, like 'a' , 'href' ,····
* */

public String getTag() {
return tag;

public void setTag(String tag) {
this.tag = tag;

public int hashCode() {
return url.hashCode();

public boolean equals(Object o) {
if (this == o) {
return true;
if (o == null || getClass() != o.getClass()) {
return false;

WebURL otherUrl = (WebURL) o;
return url != null && url.equals(otherUrl.getURL());


public String toString() {
return url;


Java网络爬虫crawler4j学习笔记 WebURL类