/** * See http://en.wikipedia.org/wiki/URL_normalization (detailed documentation) for a reference. Note: some * parts of the code are adapted from: http://stackoverflow.com/a/4057470/405418 * * @author Yasser Ganjisaffar [lastname at gmail dot com] */ // Some URLs are actually equivalent; to tell whether a page has already been crawled, a web crawler needs to put each URL into a canonical (normalized) form. publicclassURLCanonicalizer {
String host = canonicalURL.getHost().toLowerCase(); if (host == "") { // This is an invalid Url. returnnull; }
String path = canonicalURL.getPath();
/* * Normalize: no empty segments (i.e., "//"), no segments equal to * ".", and no segments equal to ".." that are preceded by a segment * not equal to "..". */ path = new URI(path.replace("\\", "/")).normalize().toString();
int idx = path.indexOf("//"); while (idx >= 0) { path = path.replace("//", "/"); idx = path.indexOf("//"); }
while (path.startsWith("/../")) { path = path.substring(3); }
/** * Takes a query string, separates the constituent name-value pairs, and * stores them in a SortedMap ordered by lexicographical order. * * @return Null if there is no query string. */ privatestatic SortedMap<String, String> createParameterMap(final String queryString) { if (queryString == null || queryString.isEmpty()) { returnnull; } // Split on '&' to obtain all parameter key-value pairs. final String[] pairs = queryString.split("&"); final Map<String, String> params = new HashMap<>(pairs.length);
for (final String pair : pairs) { if (pair.length() == 0) { continue; }