jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址或 HTML 文本内容。它提供了一套非常省力的 API,可通过 DOM、CSS 以及类似于 jQuery 的操作方法来取出和操作数据。今天我们就开始 jsoup 的学习。
jsoup解析html
jsoup的主要功能如下:
从一个URL,文件或字符串中解析HTML;
使用DOM或CSS选择器来查找、取出数据;
可操作HTML元素、属性、文本;
jsoup测试项目的结构如下,首先要下载jsoup的jar包:https://jsoup.org/download
aaarticlea/png;base64,iVBORw0KGgoAAAANSUhEUgAAAPsAAADHCAIAAABQqLInAAAZA0lEQVR4nO2d6VsT2Z7H/Tvui3lxnxnnTmtPj21fQbq1bTfUsUHtp6/aq623041NX3vEjV1lC6hNFHDBnc3I5oLEACIiggpxbUHbgKwKhCQislRCsOZFQjhVdU6lgCxU6vd5zuNDknOqTnw+qZxK6pvfNHoSKNLPdb1sMnS36rtbntU/TDmWiT7a0dO3aEcmaSwAeIRpExvWrTPsTUh5WhEz0JhsajwwWJ+gq4o8lhi6PSJpe0TS3oSUbp0BjAemIBM0fm9CSt+rcvrNTUwbGRoZHnz9+o0D41WyaX4K7QSnLQTODrQKP9fuERABEzR+Z3QqbSzDN5r+OuEC7XBVA8YDnoBofEVFhVwuJz0aEnGS1l3Ct3cji3Zklmia4s/d+mH/ZeKewXjAE+CNLy8vnzFjxvTp00nDftl2ln6ZY20j7dllF7J2xyq3hGRuCclsePry8u3nC0Iylu7KvvvsZeGlB/hNIEJqFX7TbMhUYw9PY9+pVfhh/rb+NdZ/tAef8fx7hNeFF4Mx3qr7unXreIz/VnaGbj1Bt54Yen4ycq8y5WL9pdYRa7vSNmKg3lm7NXQP/bS1AL8Ju5Cox2OPIdqpZDYvycYzPLf9TTYev0d7Z8zDgNfANt6u+/r163mMX/XNWbrp4ODTQ7+Fnt9X2JT71MRtx6v0P25Ttb98g9/EmGPWgytiJ9vVUQX5jvGcznzHeNwemcBh3lthGI/qzm+8/9rswceJP23Ni8nWZmj6uS1F3f3VltLWjj7SFjgHUuuh2k+hxRrvp9CO13jOHnAvJMIeAa+FYbxcLp+OEB8fTxo2f835rzfnhR1/nl7Zm17Zm36D0RLOt332VVFzO1F3q202x7QKBdtizqpmbDViH+Q3DV3VoC+E0RuMB9BtEvaIdIVFjdcywU8nM/I0f/MvmhWonhV4dVbg1Q8Dr364Sj171dXZq9WfrC9d9bNabxwijbWtIBCpuGed+DNX5IzTTyZjHONlmE2wtoIexDF7xJ3MAt7HBI2nabp/wOTEeUwCONEExsHEjZ8ygPHAOADjAWnhBcYDwDgA4wFpAcYD0gKMB6QFGA9ICzAekBZgPCAtppDxJ2MjPD0FwPvxgPGDhvb7p4Lrjm2qO7bpUda2basXBy3yCVrkE7TYx/pH1NeB7p8VIBE8YHzlvsA3bff6uxr6uxp66su2B85vrErRVh4KWuTTeDOlsfJQ0CIf988KkAjON54/IDuob66K83lVE/76cfzbx7G9mugdq+Y/v654du33oEU+f15P/rM8GYwHXIeTjXcYkK05sLTr4fHepty+5vy+5vywtUtsSxpmS94ahB3OuaQXCbky8k2QYwLwONN4IQHZsujZVLcKbTvXfNZ4M0V746B1VaPlWdVgrhljhlxZURIA4OA04wUGZK+Gz6Je5lnbQEt2U+l2VfyimqMBNUcDiuMWVB8JqD4ScCV2Qd3Z7zUZGzmjOflU1osAwnuAI5xjvPCA7OWdH1AtZ6iWM8ZHyTVHVnZqfqc6L1Gdl6nOIqrrCtV1heoqprpVxmcZ5YkLCNtA8qlgPDBOnGO88IBsYcj7VGOaQRNXcWCh4WEy1ZpBtWZSrVlUWzbVlk215VDt5wyP00qTluib77MHY/KprIUOGldVKUB+gIO7P508/9tM/e2IMvmn+tq9lDaF0qZS2jSqMY1qPEw1HqGajuo1clXcwp4X97DDOflUwk8iQFoVIOBu47OD3yuO9ump3k7Vx1L1cVR9HNUQTzUkUA1y6mmirjq8KGahrgmvOwBMHncbf68wMTN4Ru62WXk7PsrfNacg1Lcw1Lcw1LcgdG5RzJLbmbsGXne6eUqApJhC19UAgBsA4wFpAcYD0gKMB6QFGA9ICzAekBZgPCAtwHhAWojb+Lifv/PczoVftjbOC9yQn8kHnI7IjK9K2ngzce
NN67+JG39ZNBu9efvgZjfOBYwXJSIzvjxu46PCqKequD8uRN9O3yz/atGjwshnqrg/LkRXH918PW6DG+fiMuMBVzKFjOcPyFop2f19a216z7Ocds2JB3lhmVu/bL17rOdZTvu9E5qcnWV7vnXPVGmaBuNFylQx3mFA1kpxxDfPKxRtdccbKw/VZm1P+ecK282bKXfObL0a+RV5KDf/ii28o5JN81OobJccy1RjvTjOqmTT/BQKGWMD+NpsmJ5ojSpGDSrmRlxT6VbSTAnjhQRkrVzauf5O5i5NbtSdrNDygz/u/XLB7cxdmtyou9mh137fdCV0HWEcN/9KKBlLIyFxqyrYgoL2nmOP8lQjxPW0H/m5bwDkaoVOqnQraTxvvMCALE3TIWv8N382O27tgtP/WpP8vX/06nlhgQuiV8/bvWZ+8oZlp35dE7RgFv7TG65VpJKxjAdIf3Pv4a+/ietpux/35sF6qTi/0q2k8bDxwgOyNE33vHp5+qcATWFMQ+nv9y/GX0v5JWr1/LqCvY9U+x5eir+esvn0TysN3V2YkYKMtx/Yp47xY7edWulW0njYeOEBWStZmwNba49Zz1w153ZFr55vP3O9fTok+5fPCeO4+VdCyViHxo8tS0jGcyvO4o233js2grs0clmlWynj+VXNuDjz40r7mevtMyHJG5bZz1xvpm/JkK0gjsTkX8lnrhM2Hl9xFtOTUU/ZqiLuZMB1lW4li8iMP/HD8juZu+7lRd/NDis7sOn0v76w31QnbDi1aZmnJ+gMnLb6gGUMBpEZf+ybxUe+WWJve//BuHnyh6WenqATcN5XrmA8BpEZT/UZ+3s67G1r4GL0pmmgz9MTnBy2BYuzLAXjMYjMeBb4T2YAgIy4jQeA8QLGA9ICjAekBRgPSAswHpAWUjH+ctL/cJs6ZX5fD1y3Li2kYvyFxFkms4XVNLkbin7/WN/+wNOzczaQGyQjFePzEj4cMllYrbEiqu78hgsHPu7hl97FGSb0QhnnXAIDxpMRsfHV1dX9/f0CO5+Lm90/ZLG3gMhzG5IuvLgl116Pqj2/ITfRj2+wm1J7KtlETIcvVseHiI3PyclRKpV6vV5I58yYj/oGh+1tTnD6nOB0+83MmI/4BoPxXoS4jdfpdMePH+/o6HDY+fSev/f2D9ub1Xj7zdN7/s43GDHelcFTrvGY2rTMCSC7JlVAZM+YseJhXp0viauKxW08TdN6vV6hUDx//py/84noOYa3ZnuzGm+/eSJ6Dt9gRvDCdcFTlvGsq+o56T6avXXcfVqFDE1csYKNSPaE281LEb3xNE13dnbu2bPHaDTydD4a5dPzxmxtzd39c4LTF247Y7/naBShZrIVRkLKdcFTpvGc89nR2B/P8Rx/H+fwjaZbeLp5J6I33mAwCDnGp0b4dveara32uW5OcHpg9Dn7PakRvjxjCfUEnR485RhPPHkgV7RlbVir8JvGOaAjgUM0hsXt5pWI2/ienh6B63hF2NxOo0ld17I/787ysKw5wenx56o7jSZrU4TNJQ9lZEhdGTzlrmrQcwVS8pX16zbM1xv7DATZt59MZs+Yk7p5I+I2XvhnNft3+XUYTGX32+YEpy/dlRl2+kZjZ3+HwWRt+3fhP53kJjRcGTzlnLnynyazTk3RlDj7zIE1DZojNqmbFyJi48f1ebx8x8dtPRSpyXd87NKpjuKuFYO3r0wmg4iNHxex2z5p0VGkFrvtE7fMwk0mwleuPEjF+N1b5/E3t8zC9cY7OSnrhUjFeACwAsYD0gKMB6QFGA9ICzAekBZgPCAtwHhAWkwh46nuJ1T3E0/PwqMwL1eHD9VdwZQwfsQ8qCvb2XLi467iIMe9mZd3s66mpbn3482ZCvXAsBdlutv4plz/tEj/tNwa2+36pLRUJd+F10KpKYn0T4v0T4v0L6m33tOuSbXdkxbpnxaZ1EQaWp+EjBrDWBmEDPdPi/RXVrbb5p+qFDgtpxkv8CqXJ7peVv
vT0PduxNKWsVSvntlyap5lwNGVYZhAg+1u9BpCXl3QyxK1Cj/hajlXRJ6tudV4hlvOMb6mhCG6Ve52TSrGYybtmlT/tNSkEoc9dUql/TVTn+QB4wVeyZh059n3Vxvs7Z/qhn13/qRp2lC9vzP/o5fn5xrrzjrYE8F45NJDR7qgtZjGBxgvDJ1SidmIEOOF9jRWBrHelwROzZnGC0md/mVF1L+vTbS3v34pn/n1PpqmqZ7nL45/qiueoT2+0sGe8MajV0+N6xjPHc4MCY2tfbihUmyNWIHlYHHzdJyIRe4n5lPRPjIZ7nmh8BmvUyoZK5N2Taq/srLd+qCxMsjWk92NZh7j7TjR+JqSyCCNjjFnIdulnWs8LSB12tb1eunpPWhbdnbP5/l7Ps/fo01fqSua8fTQvP42gb8eg67X2UE4Yct4fObI9m7hIFSKy6GOoxws7l7HiVjm/fh8Kmcs63kxIRvfrskdVb8+ybbsRh9NHX0ZcLvZ/mYt1pnrePsRGoMj41lvRB40nhaQOl2dv3uXOoLVPsve2V114sWpOa1ZH7QWhvPtCXeMZx60hS4JbK8MRhSORsTgDZUSc6gCi2Pi5uk4H0h4FTJe3NwwIet5MeBf1SAni1Z3Rw+u6DIa0415v01urMf2E1zkmO3AeM6jHjReSOr0Pw9v8jnxK6v9R+pG85vuBzG+evV/aaJ8LaYB0nBWyg33wwDjWgQzjo22IBEp1crNimK25kbj8flU9mYJz8sGy/ix9bFOqYwcXcMgfjfl+isr242VQchDmG4I9hWOk1Y13L14ynghqdMb4WuVoaux7UbEuieHvuw4P+OPA77ddy8TNkBcr7OSpQ7OXBknvMj20PQnf6gUm0N1qvG4RCznZB2TT8VtFn1eTJjGIyt15GBvrAwaO3jrlMrUpJLU0UMytptOqRxbtNjX3BM1XqdUIu8A1pcco/tU/qxGHbz4YcL/ctu9mOXq4MVdtwuf7Pdpy3nvXuIX3LGctTfmQGhfgGM/qGd2xazzmTlt3lApcyu4xb0g4xnTZB2nMYlY3FO2jmalaTkRWlIqym48+4N5ZNmtzE1CDqvtmlTH3dDPzgmfx6PLGJq7tbGxqPFM+214xHiBn8cXyeY/iFvObXV7lhXJ5pvfvr65Zbah9G83fp3d/+qFs+YmHDf93p7b4Xle7HW8GPGI8QI5GTA9a/2HmLZ21smA6TRNP1D8WLPzw5IN//3iyik3z81rv9rnfV6cQ7vI8Nh3rgJ509nG02ia7m36o73yAvW6x80TQz9L9Ca89XlNmClxXQ0AuA0wHpAWYDwgLcB4QFqA8YC0AOMBaQHGA9JiChnvgpwr8+JIzyLw61xXBP/gl1cRpoTxgnOupGArufeUkJ2maZLxbom6gvEI4sq5koKteKbWRTICjRf0EDBxxJVzJQVb8YDxABdx5VxJwVaaGzlFIkpooAldDXEyo6TcqoKbVcX25K62mEEpoasa50ZdWWEQhzVivRxx5VxJwVZs5JSV8OR2YGVGybnVMQPHXk1Me4ljGaKN33jnRF2x/yX2eyX2XiKunCsh2IqPnHLKsLI7cNZImD4sRbAVLQljWd3Gcebq7Kgr8/WLe474NyCvRFw5V0LojrBgZxjv0DbHuVVe47lj3WO8kKgrajxRbDTO682IK+dKCrZiI6esVQ23Azfzz59bHTWe0VOlGD1M4sYy04ITWNU4JepKfGu0vYI4cV5vRkw5V3KwFX/yxTiiEU/vWJtn9cEaj/ZkBWCZZ3/oWaKCeIxnLDJcEXVln/Bj/qM4U/daIOcKSAvIuQLSAnKugLSYEtfVAIDbAOMBaQHGA9ICjAekBRgPSAswHpAWYDwgLURn/Gh0VYFeuyuBL8cBJ+EB4wcN7bVp391KCriVFFCb9t2god3xmFGYV52A8cC48YDxlUmBQ3rtyFDPyFDPm7Z7t1O/Ez4Wd7krGA+MA+cbX1FRIZfLSY8O6pur4nxeVYe/fhz/9nFsry
b61bXtj45/WZeysi5l5cPT35v6+a6+BOOBSeJk48vLy2fMmDF9+nRSh5oDS7seHu9tyu1rzu9rzu97kderzX2tPf+m+eKb5ov6hszatDWksYzoKuOab74QJwCgONN4q+7r1q3jMb4sajbVreJpZVGzeXZBqGxJDnECABOnGW/Xff369TzGXw2fRXXkWdtAc7b26v/VnlhVdXBh1cGFVYqFVYqF1+Vzbx1afuvQ8luHVnCH8xlPSLsCAIpzjEd15zf+8s4PqJYzVMsZ46Pk6sPLu+8fpDovUZ2Xqc7LVFcR1XWF6iqmulXGZxnliQu4wx0YD44DjnCO8XK5fDpCfHw8qWdhyPtUY5pBE1dxYKHhYTLVmkG1ZlCtmVRbFtWWTbXlUO3nDI9TS5OW6Jvvc4c7WtVg0q4AgOLuTyfP/zZTfzu8TP6pvnYvpU2htCmUNpVqTKMaD1ONR6imo3pNgipuYc+Le9jhvMZL7KeGgAnhbuOzg98rjvbpqd5O1cdQ9bFUfRzVEE81JFANcuppoq467ErsYl0TXncAmDzuNv7ehQOZwTNyt83K2/FR/q45BaG+haG+haG+BaG+xbH+d3PCB153unlKgKQQ3XU1ADApwHhAWoDxgLQA4wFpAcYD0gKMB6QFGA9IC5EZf0UxD9s8PS9ANIjPeJPZwmpPLgc5kN595R0nfzkbBFxci8iMv5g8nzJZWO15eUQ9v/RgPDCKyIzP379gkBpmtZY7yc/LI55c/jl/P+YCY/eCNX5c8RQw3rWIzPhzSQv7Bobp6OihkO09P/zUErD+7eCwvZ3bt9DTEwTjpzoiMz5Dvqh3YLi3H98y5IvwwxDl+KvC4EvmOK6xaodR/3WaTMXYuL0CoMo2CfRxpGQVGO9CRGb8ybglhj6zsuThkbzbEUdKN8XkG/rMq7dnGvrMhj7zybgl+GF8lSGZhVbtqRKy8bgaq4ytcX5Rh12ikFFkmV24CYx3LSIz/sjepbpeE6kd2bsUP4xZ7I5dVZuxDMHV9XVccZLGbc5eKY1lvL0D9m8w3rWIzPiD0cteGU10dPTQ1u2dRlPMoUtrthz/68q4TqOp02g6GL0MP4xtJlK7FGM8p7wxGO9FiMz4/ZErOvQmUjsQifn5A5pmrGo4tUs5q5oxS5GAIX+NVcZPAoLxUxqRGZ8QtrJVR115fPL0nZTdpbHf5Ib4Htv4l31ftOqoVh2VELYSPwxRDnfWiTtzRc5x2XVSuTVWHRg/uil27Xow3gOIzPiYnZ83d1MvuvAtZufn+GFO+70m+OEn0SMy46O2BzR2DpFa9LYA7CjnfeUKxosekRkfHrIqfNuq8G2BYSGB1j9G/w4MD1kVHhKIHeW83/IA40WPyIwHgEkCxgPSAowHpAUYD0gLMB6QFmA8IC3AeEBaiMv40W/g8RfiAoBjxFXPFYwHJou46rlyr7IC44HxIa56rmA8MFnEVM+VuaoRGDwFAAbiqueKN543eAoADMRVz5V0jLd3gEUO4ABx1XMF44HJIq56roRVDTd4CgAExFXPlXCM5wZPAYAA1HMFpAXUcwWkhbiuqwGAyQLGA9ICjAekBRgPSAswHpAWYDwgLcRk/A0y169fLy8v9/QEAREgMuMtw8MWy/DIiGVkxPLu3ci7dyMWy/C7dyO1tbWlpaXXyq95eo7AVEdMxldUVAwPm7nSv3s3Ul1dPTg4qFarKyoqMCPdV93SxXjNE/EcYjK+vLzcbDZhpS8tLVWr1Wq1urS0FDPSI6IgJaFw82HWO8M/yhkOxk8aMRl/7do1s4mqvP+stqG5rqGlrqGltr65rqGltqH55oM/X7182dLcjDfe3WgVfoxCC0yQ3AqmGKZKhpZcgGvjnI2YjC8tLTVRQzfqnlgsFovFYjabX9ecNpvNFovlRt0T68FerVZ7epp2CMIyLB+rI0LYAhzRnYyYjFer1dTQ4NWqexRFDQ0N9ZQk6W6def3gMkVRV6vu8RnvvnquhL2isI3n2Q
gnyc6eFXbSSAFNgIOYjC8uLh4aHCwovdXX19fb29tdHK9LW6bX641GY0HpLeuavri4GDPSffVcsXvl3s2oSkXaBGbJg85EhpSQwk4PwCEO41fEFqBNp9OdyC85e/TAmawsVWVtSk7x2QslI5bhkRFLUVERZrzL67naj/rotsjHb6S7QoY/GuNfTcxNco7ysPB3jDiMf39TKto6OjrUN2sTThXKT124dqsuNaf4SPZFi2XYYhm+dOkSZrz76rny7RUHvrIf8c2D8WaFVt8E44UiDuNpmv723Ea05apvZRXdsLaurq7UzELrp5YFBQWYwa6u54qHe0Dmfi6D7oOzY5ozFp31aCfW9MB4fkRj/AeH/4G2Qxn59paSURB7+Kz1o/r8/HzMYFfXc8XtD2V0mTTmNHsNZH8UnYu9C9d4dCOs6YHxvIjD+JqQudw2ONA/NDAwNDhIDQ2ZKMpsosxm08mTJzHjvaaeq6f37wWIw/hK2cyG/SvQVimbOdD/lin9kNlEKRQK7nCvqecKX7lOHnEYX7bu37it/+1brvTYn8rxhnquttUOHOAniziMtxImDE9PE5jSiMl4AJg8YDwgLcB4QFqA8YC0+H8m1WPh/iHunAAAAABJRU5ErkJggg==" alt="" />
一、 JsoupTest中我们从网址、文件和字符串中解析html。
package com.huhx.jsoup; import java.io.File;
import java.io.IOException; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements; public class JsoupTest {
static String html = "<html><head><title>First parse</title></head>"
+ "<body><p id='parseHtml'><font>Parsed HTML into a doc.</font></p></body></html>"; static String fileName = "file/jsoup.html";
static String url = "http://www.baidu.com"; // 从url中解析
public static void readFromUrl(String url) {
try {
Document document = Jsoup.connect(url).get();
System.out.println(document.title());
} catch (IOException e) {
e.printStackTrace();
}
} // 从文件中解析
public static void readFromFile(String file) {
File input = new File(file);
try {
Document document = Jsoup.parse(input, "UTF-8", "");
System.out.println(document.getElementsByTag("p").text()); // 通过tag名得到元素
System.out.println(document.getElementById("divid").text()); // 通过id
System.out.println(document.getElementsByClass("divclass").attr("id")); // 通过class
System.out.println(document.getElementsByAttribute("href").attr("id")); // 通过属性 } catch (IOException e) {
e.printStackTrace();
}
} // 从字符串中解析
public static void readFromString(String string) {
Document document = Jsoup.parse(string);
Elements element = document.getElementsByTag("p");
System.out.println(element.text());
System.out.println(element.html());
System.out.println(element.attr("id"));
} public static void main(String[] args) {
readFromString(html);
System.out.println("------------------------------------------------------------");
readFromFile(fileName);
System.out.println("------------------------------------------------------------");
readFromUrl(url);
}
}
二、 jsoup.html(对应代码中的 fileName,即 file/jsoup.html)的内容如下:
<!doctype html>
<!-- Sample page read by JsoupTest.readFromFile, which looks up elements by tag, id, class and attribute. -->
<html lang="en">
<head>
<title>Document</title>
</head>
<body>
<!-- Matched by the tag-name lookup getElementsByTag("p"). -->
<p>
<font>Hello World.</font>
</p>
<!-- Matched by the id lookup getElementById("divid"). -->
<div id="divid">huhx div id</div>
<!-- Matched by the class lookup getElementsByClass("divclass"); its id attribute is what gets printed. -->
<div class="divclass" id="divclassid">huhx div class</div>
<!-- Matched by the attribute lookup getElementsByAttribute("href"); its id attribute is what gets printed. -->
<a href="http://huhx.com" id="huhx">Hello huhx</a>
</body>
</html>
三、运行结果如下:
Parsed HTML into a doc.
<font>Parsed HTML into a doc.</font>
parseHtml
------------------------------------------------------------
Hello World.
huhx div id
divclassid
huhx
------------------------------------------------------------
百度一下,你就知道