用jsoup解析word文档转变的html项目实例

时间:2022-11-01 09:22:06

本文记录并分析了一个把word文档另存为html压缩包后再解析的案例。其实解析word文档有很多方法,但是将它转为html会更灵活一些。也希望有好方法的朋友互相交流,可以给我留言

 将word转html有很多的方法,我用过的最方便的是docx4j,这里不做记录。此处案例是上传并分析一套多元测评的案例,类似一套心理测试试卷。 

 下面是试卷简约样本1(图:用jsoup解析word文档转变的html项目实例)。第一步,上传并校验压缩包内容。

一。上传并解压压缩包。

1.有一方法是解析压缩包,解析压缩包代码是

/**
	 * Extracts a ZIP archive (a Word document saved as HTML) into a target directory
	 * and returns the paths of the HTML files found at the archive root.
	 *
	 * <p>Every top-level entry whose name contains ".htm" is reported as an HTML file.
	 * Every first path segment of an entry whose name contains ".files" and a "/" is
	 * counted as a resource folder. The two counts must match, otherwise the archive
	 * is rejected.
	 *
	 * <p>Entry names are decoded as GBK. Entries that would be written outside the
	 * target directory (for example names containing "../") abort the extraction,
	 * because the archive comes from an upload.
	 *
	 * @param zipFileName path of the ZIP file to extract
	 * @param descFileName directory the archive is extracted into
	 * @return the HTML file paths, or {@code null} when extraction fails or the number
	 *         of HTML files differs from the number of resource folders
	 */
	public static List<String>  unZipFilesReturnPath(String zipFileName, String descFileName) {
		// Several papers (HTML files) may be uploaded in a single archive.
		List<String> htmlFilePath=Lists.newArrayList();
		Set<String> filesFilePath=new HashSet<>();
		String descFileNames = descFileName;
		if (!descFileNames.endsWith(File.separator)) {
			descFileNames = descFileNames + File.separator;
		}
		ZipFile zipFile = null;
		try {
			// Canonical form of the target directory, used to reject path traversal.
			String canonicalRoot = new File(descFileNames).getCanonicalPath();
			if (!canonicalRoot.endsWith(File.separator)) {
				canonicalRoot = canonicalRoot + File.separator;
			}
			zipFile = new ZipFile(zipFileName,"GBK");
			byte[] buf = new byte[4096];
			@SuppressWarnings("rawtypes")
			Enumeration enums = zipFile.getEntries();
			while (enums.hasMoreElements()) {
				ZipEntry entry = (ZipEntry) enums.nextElement();
				String entryName = entry.getName();
				File target = new File(descFileNames + entryName);
				// Refuse entries that resolve outside the target directory (zip slip).
				if (!(target.getCanonicalPath() + File.separator).startsWith(canonicalRoot)) {
					throw new IllegalArgumentException("zip entry escapes target directory: " + entryName);
				}
				String lowerName = entryName.toLowerCase();
				// ".html" also contains ".htm", so one test covers both extensions.
				if (lowerName.contains(".htm") && entryName.indexOf("/") == -1) {
					htmlFilePath.add(descFileName+"/"+entryName);
				}
				if (lowerName.contains(".files") && entryName.contains("/")) {
					filesFilePath.add(entryName.split("/")[0]);
				}
				if (entry.isDirectory()) {
					target.mkdirs();
					continue;
				}
				// A file entry: make sure its parent directory exists first.
				target.getParentFile().mkdirs();
				// try-with-resources closes both streams even when the copy fails.
				try (InputStream is = zipFile.getInputStream(entry);
						OutputStream os = new FileOutputStream(target)) {
					int readByte;
					while ((readByte = is.read(buf)) != -1) {
						os.write(buf, 0, readByte);
					}
				}
			}
			if(filesFilePath.size() != htmlFilePath.size() ){
				return null;
			}
			logger.debug("文件解压成功!");
			return htmlFilePath;
		} catch (Exception e) {
			logger.debug("文件解压失败:" + e.getMessage());
			return null;
		} finally {
			if (zipFile != null) {
				try {
					zipFile.close();
				} catch (Exception ignored) {
					// Closing is best-effort; the result has already been decided.
				}
			}
		}
	}

二。接下来就是解析html内容
/**
	 * Entry point: validates the paper metadata, then parses every extracted HTML file
	 * and stores the questions it contains.
	 *
	 * <p>For each HTML file the body is cleaned, images are validated, the paragraphs
	 * are grouped and checked, images are copied to their final location and the
	 * result is persisted. Processing stops at the first validation error.
	 *
	 * <p>Previously stored images of the paper are deleted only once, before the first
	 * file's images are copied. Deleting them again for every file would remove the
	 * images just copied for the preceding files of the same upload.
	 *
	 * @param multiPaperVo paper metadata submitted with the upload
	 * @param htmlUrlList paths of the extracted HTML files
	 * @return {@code null} on success, otherwise an error message (or the failure code
	 *         when an unexpected exception occurs)
	 */
	@Transactional(readOnly = false)
	public  String totalAnalysisEvaluationHtml(MultiPaperVo multiPaperVo ,List<String> htmlUrlList) {
		try {
			if(multiPaperVo ==null || htmlUrlList == null || htmlUrlList.size() < 1 ){
				return "没有查询到文件信息";
			}
			// Validate the paper metadata.
			String checkPaperInfo = this.checkPaperInfo(multiPaperVo);
			if(StringUtils.isNotBlank(checkPaperInfo)){
				return checkPaperInfo;
			}
			// Build the paper record from the metadata.
			QtPaper qtPaper = this.insertPaperInfo(multiPaperVo, new QtPaper());
			boolean oldPicturesDeleted = false;

			for(String  htmlUrl :  htmlUrlList){
				if(StringUtils.isBlank(htmlUrl)){
					continue;
				}
				File input= new File(htmlUrl);
				if(!input.exists()){
					logger.debug("file is null:zipan bu dui");
					continue;
				}
				// Word saves the HTML in GBK; the base URI is left empty.
				Document  doc  = Jsoup.parse(input, "GBK", "");
				Element body = doc.body();
				// Drop the conditional-comment noise Word emits around images.
				body=this.refiningBody(body);
				// Reject images that link to external sites.
				String cheackImg = this.cheackImg(body);
				if(StringUtils.isNotBlank(cheackImg)){
					return cheackImg;
				}
				// Collect the paragraphs starting at the paper-name marker.
				List<Element> analysisQtPaper = this.analysisQtPaper(body);
				if(analysisQtPaper==null || analysisQtPaper.size() < 1){
					return "试卷中不存在<Ф试卷名称Ф>";
				}
				TestPaperVo vo = this.arrangePaper(analysisQtPaper);
				String checkExamPaper = this.checkExamPaper( vo , analysisQtPaper , qtPaper);
				if(StringUtils.isNotBlank(checkExamPaper)){
					return checkExamPaper;
				}
				// Remove the paper's previous images once, before copying any new ones.
				if(!oldPicturesDeleted){
					String deletePic = this.deletePic(qtPaper);
					if(StringUtils.isNotBlank(deletePic)){
						logger.error("服务器删除图片出错,图片地址为:"+deletePic);
					}
					oldPicturesDeleted = true;
				}
				// Copy the images and rewrite their src attributes.
				body = this.replaceImgs(body, htmlUrl,qtPaper);
				// The src attributes changed, so re-derive the grouped data when images exist.
				Elements links = body.getElementsByTag("img");
				if(links != null && links.size()>0){
					analysisQtPaper = this.analysisQtPaper(body);
					vo = this.arrangePaper(analysisQtPaper);
				}
				String insertExamPaperInfo = this.insertExamPaperInfo(vo,qtPaper);
				if(StringUtils.isNotBlank(insertExamPaperInfo)){
					return insertExamPaperInfo;
				}
			}
			return null;
		} catch (Exception e) {
			logger.error("failed to analyse evaluation html", e);
			return RestCode.FAILURE.getCode();
		}
	}
	
1.我们从解析文件开始说,解析文件就是将文件变为Document,方便后续文件的解析,其实从这开始就是一些业务的数据,然后是获取body的数据,因为我们要的就是文档的内容,其他的就不要了

2.如果文档里有图片,html源码中会有很多垃圾注释,而且超级长,据说是低版本ie浏览器才会用到的东西,我们要将它们去掉

	/**
	 * Removes HTML comments from every {@code span} and {@code p} element of the body.
	 *
	 * <p>The inner HTML of each element is scanned for {@code <!-- ... -->} sections,
	 * which are cut out before the cleaned markup is written back.
	 *
	 * @param body the document body to clean in place
	 * @return the same {@code body} instance
	 */
	public Element refiningBody(Element body){
		this.stripComments(body.getElementsByTag("span"));
		this.stripComments(body.getElementsByTag("p"));
		return body;
	}

	/**
	 * Cuts all comment sections out of the inner HTML of the given elements.
	 * Elements without a comment are left untouched.
	 */
	private void stripComments(Elements elements){
		for(Element element : elements){
			String html = element.html();
			if(html.indexOf("<!--") == -1){
				continue;
			}
			StringBuilder cleaned = new StringBuilder(html.length());
			int cursor = 0;
			while(cursor < html.length()){
				int open = html.indexOf("<!--", cursor);
				if(open == -1){
					cleaned.append(html, cursor, html.length());
					break;
				}
				cleaned.append(html, cursor, open);
				int close = html.indexOf("-->", open + 4);
				if(close == -1){
					// Unterminated comment: keep the remainder instead of failing or looping.
					cleaned.append(html, open, html.length());
					break;
				}
				cursor = close + 3;
			}
			element.html(cleaned.toString());
		}
	}

3.校验图片主要是校验外网图片,因为很多文档从网上直接下载的,图片是个链接,这样的我们暂时不要,假如你需要,校验出来后,你要去外网下载图片,然后上传

	/**
	 * Checks that no image in the body points to an external URL.
	 *
	 * <p>An image counts as external when its {@code src} starts with {@code http://},
	 * {@code https://} or the protocol-relative {@code //} (case-insensitive,
	 * surrounding whitespace ignored). Matching the prefix rather than any occurrence
	 * of "http" keeps local files whose name or folder merely contains that text.
	 *
	 * @param body the document body to inspect
	 * @return an error message naming the first external image, or {@code null} when
	 *         there is none
	 */
	public String cheackImg(Element body){
		for (Element image : body.getElementsByTag("img")) {
			String src = image.attr("src");
			String normalized = src.trim().toLowerCase();
			boolean external = normalized.startsWith("http://")
					|| normalized.startsWith("https://")
					|| normalized.startsWith("//");
			if(external){
				return "图片包含外网链接图片:"+src;
			}
		}
		return null;
	}

4.分解数据,就是将他按照段落分解,然后存入集合,个人习惯,你也可以直接用
 
/**
	 * Collects the paragraph that carries the paper-name marker and all element
	 * siblings that follow it.
	 *
	 * <p>The first element whose own text contains the paper-name description is
	 * located, then its ancestors are walked until a {@code p} element is reached.
	 *
	 * @param body the document body to search
	 * @return the marker paragraph followed by its later siblings, or {@code null} when
	 *         the marker is missing or is not inside a {@code p} element
	 */
	public List<Element> analysisQtPaper(Element body){
		Element current = body.getElementsContainingOwnText(LabelEnum.PAPERS_NAME.getDesc()).first();
		// Walk up to the enclosing <p>; stop when the top of the tree is passed.
		while(current != null && !current.tagName().equals("p")){
			current = current.parent();
		}
		if(current == null){
			return null;
		}
		List<Element> bodyElementList = new ArrayList<>();
		for(Element sibling = current; sibling != null; sibling = sibling.nextElementSibling()){
			bodyElementList.add(sibling);
		}
		return bodyElementList;
	}
	
*其中用到了一个标签枚举类,这些是自定义标签

/**
 * Custom markers that structure an exam paper document.
 *
 * <p>Each constant pairs the marker text ({@code key}) with a short description
 * ({@code desc}).
 */
public enum LabelEnum {
	 PAPERS_NAME("Ф试卷名称Ф", "试卷名称"),
	 PAPERS_DESC("Ф试卷简介Ф", "试卷简介"),
	 PAPERS_DEMAND("Ф作答要求Ф", "作答要求"),
	 PAPERS_INFO_ONE("<Ⅰ卷>", "Ⅰ卷"),
	 PAPERS_INFO_TWO("<Ⅱ卷>", "Ⅱ卷"),
	 QUESTION_FATHER_CONTENT("Ф父题内容Ф", "父题内容"),
	 QUESTION_FATHER_CONTENT_END("Ф父题结束Ф", "父题结束"),
	 QUESTION_CONTENT("Ф题目内容Ф", "题目内容"),
	 QUESTION_MODULE("Ф试题所属模块Ф", "试题所属模块"),
	 QUESTION_SCORE("Ф分值Ф", "分值"),
	 QUESTION_SUBJECTIVE("Ф主观题Ф", "主观题"),
	 QUESTION_ANALYZE("Ф试题解析Ф", "试题解析"),
	 QUESTION_OPTION_APPRAISE("Ф选项评析Ф", "选项评析"),
	 QUESTION_ANSWER_APPRAISE("Ф答案评析Ф", "答案评析"),
	 QUESTION_CLASSIFICATION("Ф试题所属分类Ф", "试题所属分类"),
	 OPTION_COMMENTARY("<★>", "★");

	/** Marker text as it appears in the document. */
	String key;
	/** Short description of the marker. */
	String desc;

	LabelEnum(String key, String desc) {
		this.key = key;
		this.desc = desc;
	}

	/**
	 * Looks up a constant by its marker text.
	 *
	 * @param key marker text to search for
	 * @return the first constant whose key equals {@code key}, or {@code null} when
	 *         none matches
	 */
	public static LabelEnum getEnumByKey(String key) {
		LabelEnum found = null;
		LabelEnum[] candidates = LabelEnum.values();
		for (int i = 0; i < candidates.length && found == null; i++) {
			if (candidates[i].getKey().equals(key)) {
				found = candidates[i];
			}
		}
		return found;
	}

	public String getKey() {
		return this.key;
	}

	public String getDesc() {
		return this.desc;
	}

	public void setKey(String key) {
		this.key = key;
	}

	public void setDesc(String desc) {
		this.desc = desc;
	}
}
5.分析数据,并将数据分类,代码中的标签内段落数差异很大,所以要做成这样

/**
	 * Sorts the paper's paragraphs into sections according to the markers they contain.
	 *
	 * <p>A paragraph carrying a marker switches the current section; paragraphs without
	 * a marker are added to whichever sections are currently active. Questions are
	 * grouped into one list per question: a new group starts at every parent-question
	 * marker, at every question marker outside a parent question, and at the second
	 * and later paper-part markers.
	 *
	 * <p>While the paper-part section is active, the paragraph following each of its
	 * paragraphs is recorded as well. No follower is recorded for the last paragraph
	 * of the list, which has none.
	 *
	 * @param bodyList paragraphs of the paper in document order
	 * @return the grouped paragraphs together with the question count
	 */
	public  TestPaperVo  arrangePaper(List<Element>  bodyList){
		// Paper name, introduction and answering requirements.
		List<Element>  exNameList = new ArrayList<>();
		List<Element>  exDescList = new ArrayList<>();
		List<Element>  exDemandList = new ArrayList<>();
		// Paper-part paragraphs and the paragraph following each of them.
		List<Element>  exInfoList = new ArrayList<>();
		List<Element>  exInfoNextList = new ArrayList<>();
		// One inner list per question group, plus the group being filled.
		List<List<Element>>  exQuestionList = new ArrayList<>();
		List<Element>  questionList = new ArrayList<>();
		// Paragraphs carrying a score marker / a module marker.
		List<Element>  questionScoreList = new ArrayList<>();
		List<Element>  questionMouduleList = new ArrayList<>();
		// Number of question markers seen.
		Integer questionNum = 0;

		boolean isExName = false;
		boolean isExDesc = false;
		boolean isDemand = false;
		boolean isExInfo = false;
		boolean isNew = false;
		boolean isAdd = false;
		boolean isfather = false;
		int paperInfo=0;

		int lastIndex = bodyList.size() - 1;
		for(int f=0; f<bodyList.size(); f++){
			Element current = bodyList.get(f);
			String text = current.text();
			boolean hasPaperPartLabel = text.contains(LabelEnum.PAPERS_INFO_ONE.getKey())
					|| text.contains(LabelEnum.PAPERS_INFO_TWO.getKey());
			boolean hasQuestionLabel = text.contains(LabelEnum.QUESTION_CONTENT.getKey());

			// Paper name.
			if(text.contains(LabelEnum.PAPERS_NAME.getKey())){
				isExName = true;
			}
			// Introduction.
			if(text.contains(LabelEnum.PAPERS_DESC.getKey())){
				isExDesc = true;
				isExName = false;
				isDemand = false;
				isExInfo = false;
			}
			// Answering requirements.
			if(text.contains(LabelEnum.PAPERS_DEMAND.getKey())){
				isDemand = true;
				isExName = false;
				isExDesc = false;
				isExInfo = false;
			}
			// Paper part.
			if(hasPaperPartLabel){
				isExInfo = true;
				isExName = false;
				isExDesc = false;
				isDemand = false;
				paperInfo++;
			}
			// Start of a parent question.
			if(text.contains(LabelEnum.QUESTION_FATHER_CONTENT.getKey())){
				isNew = true;
				isAdd = true;
				isfather = true;
				isExName = false;
				isExDesc = false;
				isDemand = false;
				isExInfo = false;
			}
			// End of a parent question.
			if(text.contains(LabelEnum.QUESTION_FATHER_CONTENT_END.getKey())){
				isfather = false;
				isExName = false;
				isExDesc = false;
				isDemand = false;
				isExInfo = false;
			}
			// A stand-alone question (not a child of a parent question).
			if(hasQuestionLabel && !isfather){
				isNew = true;
				isAdd = true;
				isExName = false;
				isExDesc = false;
				isDemand = false;
				isExInfo = false;
			}
			// From the second paper-part marker on, the marker opens its own group;
			// the paper-part section itself stays active.
			if(paperInfo>1 && hasPaperPartLabel){
				isNew = true;
				isAdd = true;
				isExName = false;
				isExDesc = false;
				isDemand = false;
			}

			// Distribute the paragraph according to the active sections.
			if(isExName){
				exNameList.add(current);
			}
			if(isExDesc){
				exDescList.add(current);
			}
			if(isDemand){
				exDemandList.add(current);
			}
			if(isExInfo){
				exInfoList.add(current);
				// The last paragraph has no follower; reading f+1 there would throw.
				if(f < lastIndex){
					exInfoNextList.add(bodyList.get(f+1));
				}
			}
			if(text.contains(LabelEnum.QUESTION_MODULE.getKey())){
				questionMouduleList.add(current);
			}
			if(isAdd){
				if(isNew){
					isNew = false;
					if(questionList.size()>0){
						exQuestionList.add(questionList);
					}
					questionList = new ArrayList<>();
				}
				questionList.add(current);
				// The final paragraph closes the group that is still open.
				if(f == lastIndex){
					exQuestionList.add(questionList);
				}
			}
			if(hasQuestionLabel){
				questionNum++;
			}
			if(text.contains(LabelEnum.QUESTION_SCORE.getKey())){
				questionScoreList.add(current);
			}
		}

		TestPaperVo vo = new TestPaperVo();
		vo.setExNameList(exNameList);
		vo.setExDescList(exDescList);
		vo.setExDemandList(exDemandList);
		vo.setExInfoList(exInfoList);
		vo.setExQuestionList(exQuestionList);
		vo.setQuestionNum(questionNum);
		vo.setQuestionScoreList(questionScoreList);
		vo.setExInfoNextList(exInfoNextList);
		vo.setQuestionMouduleList(questionMouduleList);
		return vo;
	}
	

6.校验。此处不做解释,都是业务需求

	/**
	 * Runs the validation chain on a parsed paper and reports the first failure.
	 *
	 * <p>Checks, in order: parent-question marker pairing, markers per line, marker
	 * format, score totals, question count, paper-part markers, module attributes and
	 * question format.
	 *
	 * @param vo the grouped paragraphs of the paper
	 * @param analysisQtPaper the paper's paragraphs in document order
	 * @param qtPaper the paper record holding the expected question count
	 * @return {@code null} when every check passes, otherwise an error message
	 */
	public String checkExamPaper(TestPaperVo vo ,List<Element> analysisQtPaper ,QtPaper qtPaper){
		// Parent-question start and end markers must pair up.
		String checkFatherLabel = this.checkFatherLabel(analysisQtPaper);
		if(StringUtils.isNotBlank(checkFatherLabel)){
			return "<Ф父题内容Ф>和<Ф父题结束Ф>标签不对应,出错原文内容:"+firstFiftyChars(checkFatherLabel);
		}
		// At most one marker per line.
		String checkHtmlLabelNum = this.checkHtmlLabelNum(analysisQtPaper);
		if(StringUtils.isNotBlank(checkHtmlLabelNum)){
			return "同行内出现多个标签,出错原文内容:"+firstFiftyChars(checkHtmlLabelNum);
		}
		// Markers must be well-formed.
		String checkHtmlLabelNorm = this.checkHtmlLabelNorm(analysisQtPaper);
		if(StringUtils.isNotBlank(checkHtmlLabelNorm)){
			return 	"试卷中有标签错误,错误原文内容:"+firstFiftyChars(checkHtmlLabelNorm);
		}
		// Score totals.
		String checkExamScore = this.checkExamScore(qtPaper,vo);
		if(StringUtils.isNotBlank(checkExamScore)){
			return "试卷分值总分和小题分值之和不等";
		}
		// Question count. The counts are compared by value, because != on two boxed
		// numbers would compare object identity.
		Number expectedCount = qtPaper.getQuestionCount();
		Number actualCount = vo.getQuestionNum();
		boolean countsDiffer;
		if(expectedCount == null || actualCount == null){
			countsDiffer = expectedCount != actualCount;
		}else{
			countsDiffer = expectedCount.longValue() != actualCount.longValue();
		}
		if(countsDiffer){
			return "试题数目与<Ф题目内容Ф>标签总数不等";
		}
		// Paper-part markers are either absent or appear as a pair.
		if(vo.getExInfoList().size() > 0){
			if(vo.getExInfoList().size() != 2){
				return "<Ⅰ卷>和<Ⅱ卷>标签不对应";
			}
			// The first paper-part marker must sit directly before the first question.
			boolean firstQuestionMissing = vo.getExQuestionList().isEmpty()
					|| vo.getExQuestionList().get(0).isEmpty();
			if(vo.getExInfoNextList().size()<1 || firstQuestionMissing
					|| !vo.getExInfoNextList().get(0).text().equals(vo.getExQuestionList().get(0).get(0).text()) ){
				return "<Ⅰ卷>和<Ⅱ卷>标签不对应";
			}
			if(vo.getExInfoNextList().size()==2){
				String secondFollower = vo.getExInfoNextList().get(1).text();
				if(!secondFollower.contains(LabelEnum.QUESTION_FATHER_CONTENT.getKey())
						&& !secondFollower.contains(LabelEnum.QUESTION_CONTENT.getKey()) ){
					return "<Ⅰ卷>和<Ⅱ卷>标签不对应";
				}
			}
		}
		// Module attributes.
		String checkMudule = checkMudule(vo,qtPaper);
		if(null!= checkMudule){
			return checkMudule;
		}
		// Question format.
		String checkExamTopic = this.checkExamTopic(vo);
		if(StringUtils.isNotBlank(checkExamTopic)){
			return checkExamTopic;
		}
		return null;
	}

	/** Limits a quoted source fragment to its first 50 characters for error messages. */
	private static String firstFiftyChars(String fragment){
		return fragment.length()>50 ? fragment.substring(0, 50) : fragment;
	}

7.删除照片是这样的,一种类型的卷子只能有一套,这样来说的话,你要上传就要把以前的照片删除,自我判断是否需要

/**
	 * Deletes the image directory that belongs to a paper.
	 *
	 * <p>The directory name is built from the paper's grade, class-divide type and test
	 * type. The image path returned by {@code UploadFileUtil} is cut right after that
	 * name and the resulting directory is deleted.
	 *
	 * <p>The name is located as literal text. Treating it as a regular expression
	 * would misread any special character it contains.
	 *
	 * @param qtPaper the paper whose images are removed
	 * @return {@code null} when the directory was deleted, otherwise the path that
	 *         could not be deleted
	 */
	public String deletePic(QtPaper qtPaper){
		UploadFileUtil uploadFileUtil  =  SpringContextHolder.getBean(UploadFileUtil.class);
		String paperPice =	qtPaper.getGrade()+"_"+qtPaper.getClassDivideType()+"_"+qtPaper.getTestType();
		String imagePath = uploadFileUtil.getImagePath(UploadFileUtil.FileType.paper, "temc", paperPice);
		int nameStart = imagePath.indexOf(paperPice);
		if(nameStart < 0){
			// The paper directory is not part of the path: delete nothing and report it.
			return imagePath;
		}
		// Keep everything up to and including the paper directory name.
		imagePath = imagePath.substring(0, nameStart + paperPice.length());
		boolean deleteDir = FileUtils.deleteDir(new File(imagePath));
		return deleteDir ? null : imagePath;
	}

8.替换图片的主要作用是将上传后的图片地址重新写回文档内容中,并将3K以下的图片转为Base64编码

	/**
	 * Copies every image referenced by the body to the paper's image directory and
	 * rewrites the {@code src} attributes.
	 *
	 * <p>Each image is copied under a new name (a generated id followed by the original
	 * file name). The {@code src} becomes the image URL when
	 * {@code FileUtils.getImageStr} returns "0", otherwise the string it returns.
	 *
	 * @param body the document body whose images are rewritten in place
	 * @param htmlUrl path of the HTML file; image paths are resolved against its folder
	 * @param qtPaper the paper that determines the image directory name
	 * @return the same {@code body} instance
	 */
	public Element replaceImgs(Element body,String htmlUrl,QtPaper qtPaper ){
		String paperPice = qtPaper.getGrade()+"_"+qtPaper.getClassDivideType()+"_"+qtPaper.getTestType();
		Elements images = body.getElementsByTag("img");
		UploadFileUtil uploader = SpringContextHolder.getBean(UploadFileUtil.class);
		for (Element image : images) {
			String src = image.attr("src");
			// Target name: generated id + file name taken from the original src.
			String targetName = IdGen.uuid() + src.substring(src.lastIndexOf("/")+1, src.length());
			String fileName = uploader.getImagePath(UploadFileUtil.FileType.paper,  "temc", paperPice, targetName);
			// Copy from the extraction folder to the final location.
			String sourcePath = htmlUrl.substring(0, htmlUrl.lastIndexOf("/")+1) + src;
			FileUtils.copyFile(sourcePath, fileName);
			String picurl = uploader.getImageUrl(fileName);
			String encoded = FileUtils.getImageStr(fileName);
			String replacement = encoded.equals("0") ? picurl : encoded;
			image.attr("src", replacement);
		}
		return body;
	}
	 /**
	  * Reads an image file and returns it as a Base64 data URI.
	  *
	  * <p>The whole file is read in a loop, since a single {@code read} call is not
	  * guaranteed to fill the buffer. The stream is always closed.
	  *
	  * <p>NOTE(review): the MIME type is always {@code image/png}, whatever the file
	  * extension — confirm this is intended for other image formats.
	  *
	  * @param imgFile path of the image file
	  * @return the string {@code data:image/png;base64,} followed by the encoded bytes
	  * @throws IllegalStateException when the file cannot be read
	  */
	 public static String getImageStr(String imgFile) {
		 java.io.ByteArrayOutputStream content = new java.io.ByteArrayOutputStream();
	     try (InputStream inputStream = new FileInputStream(imgFile)) {
	         byte[] chunk = new byte[4096];
	         int count;
	         while ((count = inputStream.read(chunk)) != -1) {
	             content.write(chunk, 0, count);
	         }
	     } catch (IOException e) {
	         // Fail with the cause attached instead of running into a null array later.
	         throw new IllegalStateException("failed to read image file: " + imgFile, e);
	     }
	     // Base64 output is plain ASCII, so the charset is named explicitly.
	     String encoded = new String(Base64.encodeBase64(content.toByteArray()),
	             java.nio.charset.StandardCharsets.US_ASCII);
	     return "data:image/png;base64," + encoded;
	 }


9.因为上文将该类下的图片全部删掉了,所以文档中有图片时就要重新分析一下,具体为什么,记不清了

10。分析插入数据。只贴代码,不做解释,待会贴几个方法

/**
	 * Fills the paper record from the grouped paragraphs and saves it with its questions.
	 *
	 * <p>Name, introduction and answering requirements are taken from their sections
	 * with the marker removed. Each question group becomes either a parent question
	 * with children or a single question. A group that starts with a paper-part marker
	 * only switches the current paper part.
	 *
	 * @param vo the grouped paragraphs of the paper
	 * @param qtPaper the paper record to fill and save
	 * @return {@code null} on success, otherwise an error message
	 */
	private String insertExamPaperInfo(TestPaperVo vo,QtPaper qtPaper) {
		List<String> labelList = new ArrayList<>();
		// Paper name.
		if(vo.getExNameList() != null  && vo.getExNameList().size()>0){
			labelList.add("<"+LabelEnum.PAPERS_NAME.getKey()+">");
			String paperName = this.deleteLabelString(vo.getExNameList(),labelList);
			qtPaper.setPaperName(paperName);
		}
		qtPaper.setClassify(Constants.TEST_ASSESSMENT_CLASSIFY_MULTI);
		qtPaper.setPaperState(Constants.UNPUBLISH_PAPER_STATE);
		qtPaper.setDelFlag(Constants.DEL_FLAG_NORMAL);
		// Introduction.
		if(vo.getExDescList() != null  && vo.getExDescList().size()>0){
			labelList.clear();
			labelList.add("<"+LabelEnum.PAPERS_DESC.getKey()+">");
			String introduction = this.deleteLabelString(vo.getExDescList(),labelList);
			qtPaper.setDescription(introduction);
		}
		// Answering requirements.
		if(vo.getExDemandList() != null && vo.getExDemandList().size() > 0){
			labelList.clear();
			labelList.add("<"+LabelEnum.PAPERS_DEMAND.getKey()+">");
			String demand = this.deleteLabelString(vo.getExDemandList(),labelList);
			qtPaper.setDemand(demand);
		}

		// The first paper-part paragraph decides the initial paper part.
		TransferVo tVo= new TransferVo();
		if(vo.getExInfoList()!= null && vo.getExInfoList().size() >0 ){
			boolean startsWithPartOne = vo.getExInfoList().get(0).text().contains(LabelEnum.PAPERS_INFO_ONE.getKey());
			if(startsWithPartOne){
				tVo.setPaperInfo(Constants.PAPER_INFO_ONE);
			}else{
				tVo.setPaperInfo(Constants.PAPER_INFO_TWO);
			}
		}

		List<QtQuestion> questions = new ArrayList<>();
		for(List<Element> group : vo.getExQuestionList()){
			Element head = group.get(0);
			boolean headIsPartOne = head.text().contains(LabelEnum.PAPERS_INFO_ONE.getKey());
			// A group headed by a paper-part marker only switches the current part.
			if(headIsPartOne || head.text().contains(LabelEnum.PAPERS_INFO_TWO.getKey())){
				if(headIsPartOne){
					tVo.setPaperInfo(Constants.PAPER_INFO_ONE);
				}else{
					tVo.setPaperInfo(Constants.PAPER_INFO_TWO);
				}
				continue;
			}
			QtQuestion question;
			if(head.text().contains(LabelEnum.QUESTION_FATHER_CONTENT.getKey())){
				// Parent question with children.
				question = this.combingFatherTopic(group, tVo);
			}else{
				// Single question.
				question = this.combingAloneTopic(group, tVo, null, Constants.Level_SINGLE);
			}
			questions.add(question);
		}
		qtPaper.setQuestions(questions);

		boolean saved = ptPaperService.saveWithQuestions(qtPaper);
		if(saved){
			return null;
		}
		return "信息入库时失败,请稍后重试!";
	}
	

/**
	 * Builds a parent question with its child questions from one question group.
	 *
	 * <p>The group is split into the parent's title paragraphs, the parent's analysis
	 * paragraphs (an analysis marker seen while no child question is open) and one
	 * paragraph list per child question. Children are numbered from 1.
	 *
	 * @param elListFor paragraphs of the group, starting at the parent-question marker
	 * @param tVo transfer data passed on to the question builders
	 * @return the parent question with its children attached
	 */
	public QtQuestion combingFatherTopic(List<Element> elListFor, TransferVo tVo){
		List<Element> titleParagraphs = new ArrayList<>();
		List<Element> analysisParagraphs = new ArrayList<>();
		List<List<Element>> childGroups = new ArrayList<>();
		List<Element> openChild = new ArrayList<>();

		boolean inTitle = false;
		boolean inAnalysis = false;
		boolean inChild = false;
		boolean startChild = false;

		for(int index = 0; index < elListFor.size(); index++){
			Element paragraph = elListFor.get(index);
			String text = paragraph.text();

			if(text.contains(LabelEnum.QUESTION_FATHER_CONTENT.getKey())){
				inTitle = true;
			}
			// An analysis marker belongs to the parent only while no child is open.
			if(!inChild && text.contains(LabelEnum.QUESTION_ANALYZE.getKey())){
				inAnalysis = true;
				inTitle = false;
			}
			if(text.contains(LabelEnum.QUESTION_CONTENT.getKey())){
				startChild = true;
				inChild = true;
				inTitle = false;
				inAnalysis = false;
			}
			// The end marker closes the child that is still open.
			if(text.contains(LabelEnum.QUESTION_FATHER_CONTENT_END.getKey())){
				if(openChild.size()>0){
					childGroups.add(openChild);
				}
				inChild = false;
			}

			if(inTitle){
				titleParagraphs.add(paragraph);
			}
			if(inAnalysis){
				analysisParagraphs.add(paragraph);
			}
			if(inChild){
				if(startChild){
					startChild = false;
					if(openChild.size()>0){
						childGroups.add(openChild);
					}
					openChild = new ArrayList<>();
				}
				openChild.add(paragraph);
			}
		}

		// Parent question: title text with the marker removed.
		List<String> labels = new ArrayList<>();
		labels.add(LabelEnum.QUESTION_FATHER_CONTENT.getKey());
		String description = this.deleteLabelString(titleParagraphs, labels);
		QtQuestion parent = this.insertQtQuestion( tVo,null, description, null,"",Constants.Level_ParentChild_PARENT,0);

		List<QtQuestionAttribute> attributes = new ArrayList<>();
		List<QtQuestion> children = new ArrayList<>();
		// Attach the analysis attribute only when the parent has one.
		if(analysisParagraphs.size() > 0 ){
			labels.clear();
			labels.add(LabelEnum.QUESTION_ANALYZE.getKey());
			String analysis = this.deleteLabelString(analysisParagraphs, labels);
			attributes.add(this.insertQtQuestionAttribute("stsxAnalysis", analysis));
			parent.setAttributes(attributes);
		}
		int order = 0;
		for(List<Element> childGroup : childGroups){
			order++;
			children.add(combingAloneTopic(childGroup, tVo, order,Constants.Level_ParentChild_CHILD));
		}
		parent.setChildren(children);
		return parent;
	}
	
	/**
	 * Builds one question from its paragraphs.
	 *
	 * <p>A paragraph carrying a marker selects the section that collects it and the
	 * paragraphs after it. The subjective marker switches the question to subjective
	 * and selects no section. When a paragraph carries several markers, the one
	 * checked last wins.
	 *
	 * <p>NOTE(review): the first score paragraph is read without a presence check, so
	 * a question without a score marker is presumably rejected earlier — confirm
	 * against the validation step.
	 *
	 * @param elListFor paragraphs of the question
	 * @param tVo transfer data passed on to the question builder
	 * @param orderNumber position among the children; only passed for child questions
	 * @param level question level
	 * @return the question, or {@code null} when {@code elListFor} is null or empty
	 */
	public QtQuestion combingAloneTopic(List<Element>  elListFor ,TransferVo tVo,Integer orderNumber,String level){
		if(elListFor == null || elListFor.size() < 1){
			return null;
		}
		List<Element> titleContentList = new ArrayList<>();
		List<Element> moduleList = new ArrayList<>();
		List<Element> classificationList = new ArrayList<>();
		List<Element> scoreList = new ArrayList<>();
		List<Element> questionAnalyzeList = new ArrayList<>();
		List<Element> questionOptionAppraiseList = new ArrayList<>();
		List<Element> questionAnswerAppraiseList = new ArrayList<>();
		// Objective unless the subjective marker is found.
		boolean isObjective = true;

		// Exactly one section collects paragraphs at a time; null means none.
		List<Element> activeSection = null;
		for(Element paragraph : elListFor){
			String text = paragraph.text();
			if(text.contains(LabelEnum.QUESTION_CONTENT.getKey())){
				activeSection = titleContentList;
			}
			if(text.contains(LabelEnum.QUESTION_MODULE.getKey())){
				activeSection = moduleList;
			}
			if(text.contains(LabelEnum.QUESTION_CLASSIFICATION.getKey())){
				activeSection = classificationList;
			}
			if(text.contains(LabelEnum.QUESTION_SCORE.getKey())){
				activeSection = scoreList;
			}
			if(text.contains(LabelEnum.QUESTION_SUBJECTIVE.getKey())){
				isObjective = false;
				activeSection = null;
			}
			if(text.contains(LabelEnum.QUESTION_ANALYZE.getKey())){
				activeSection = questionAnalyzeList;
			}
			if(text.contains(LabelEnum.QUESTION_OPTION_APPRAISE.getKey())){
				activeSection = questionOptionAppraiseList;
			}
			if(text.contains(LabelEnum.QUESTION_ANSWER_APPRAISE.getKey())){
				activeSection = questionAnswerAppraiseList;
			}
			if(activeSection != null){
				activeSection.add(paragraph);
			}
		}

		List<String> labels = new ArrayList<>();
		// Question text.
		labels.add(LabelEnum.QUESTION_CONTENT.getKey());
		String description = this.deleteLabelString(titleContentList, labels);
		// Module.
		labels.clear();
		labels.add(LabelEnum.QUESTION_MODULE.getKey());
		String module = this.deleteLabelString(moduleList, labels);
		// Classification.
		labels.clear();
		labels.add(LabelEnum.QUESTION_CLASSIFICATION.getKey());
		String classification = this.deleteLabelString(classificationList, labels);
		// Score: with several values the highest one counts, otherwise the only one.
		List<Integer> elementScore = this.getElementScore(scoreList.get(0).text());
		Integer topScore = 0;
		if(elementScore.size() <= 1){
			topScore = elementScore.get(0);
		}else{
			for(Integer candidate : elementScore){
				if(topScore < candidate){
					topScore = candidate;
				}
			}
		}
		// Analysis.
		boolean hasAnalysis = questionAnalyzeList.size()>0;
		String questionAnalyze = "";
		if(hasAnalysis){
			labels.clear();
			labels.add(LabelEnum.QUESTION_ANALYZE.getKey());
			questionAnalyze = this.deleteLabelString(questionAnalyzeList, labels);
		}
		// Answer appraisal.
		boolean hasAnswerAppraise = questionAnswerAppraiseList.size()>0;
		String questionAnswerAppraise = "";
		if(hasAnswerAppraise){
			labels.clear();
			labels.add(LabelEnum.QUESTION_ANSWER_APPRAISE.getKey());
			questionAnswerAppraise = this.deleteLabelString(questionAnswerAppraiseList, labels);
		}

		// Question record.
		QtQuestion question = this.insertQtQuestion(tVo, isObjective?Constants.QuestionTypeChoice:Constants.QuestionTypeEssay, description, orderNumber,"", level, topScore);
		// Up to four attributes: analysis, answer appraisal, module, classification.
		List<QtQuestionAttribute> attributes = new ArrayList<>();
		if(hasAnalysis){
			attributes.add(this.insertQtQuestionAttribute("stsxAnalysis", questionAnalyze));
		}
		if(hasAnswerAppraise){
			attributes.add(this.insertQtQuestionAttribute("stsxEvaluate", questionAnswerAppraise));
		}
		if(StringUtils.isNotBlank(module)){
			Integer modules = charAtStringReturnDouble(module);
			if(null != modules){
				attributes.add(this.insertQtQuestionAttribute("stsxModule", modules.toString()));
			}
		}
		if(StringUtils.isNotBlank(classification)){
			attributes.add(this.insertQtQuestionAttribute("stsxType", classification));
		}
		// Only objective questions carry answer options.
		if(isObjective){
			List<QtQuestionAnswer> choices = this.insertQtQuestionChoice(questionOptionAppraiseList, elementScore, topScore);
			question.setAnswers(choices);
		}
		question.setAttributes(attributes);
		return question;
	}

11.下面贴几个自己写的方法类
/**
	 * Turns a list of paragraphs into an HTML string with the leading marker removed.
	 *
	 * <p>In a paragraph containing one of the given labels, everything up to and
	 * including the first {@code >} is dropped. Paragraphs without images are wrapped
	 * in {@code <P>} unless the labels name a title-like field (paper name, module,
	 * classification), in which case the bare text is appended. Paragraphs holding only
	 * images yield the image tags inside {@code <p>}. Paragraphs with text and images
	 * yield the text with the image tags substituted for placeholders.
	 *
	 * <p>Side effect: for paragraphs with both text and images, a placeholder
	 * {@code span} per image is appended to the image's parent element and stays in
	 * the document.
	 *
	 * <p>The placeholders are replaced as literal text. A regex replacement would
	 * misread {@code $} and {@code \} inside the image markup.
	 *
	 * @param elementList paragraphs to convert
	 * @param labellist labels whose marker is stripped from the paragraph text
	 * @return the assembled HTML, or an empty string when either list is null or empty
	 */
	public  String deleteLabelString(List<Element> elementList  ,List<String> labellist){
		if(elementList==null || elementList.size()<1 || labellist==null || labellist.size()<1){
			return "";
		}
		// Title-like fields are returned as bare text, without a paragraph wrapper.
		boolean isTitle = false;
		for(String label : labellist){
			if(label.contains(LabelEnum.PAPERS_NAME.getKey()) || label.contains(LabelEnum.QUESTION_MODULE.getKey())|| label.contains(LabelEnum.QUESTION_CLASSIFICATION.getKey())){
				isTitle = true;
			}
		}

		StringBuilder returnData = new StringBuilder();
		for(Element elFor : elementList){
			String plainText = elFor.text();
			boolean hasLabel = false;
			for(String label : labellist){
				if(plainText.contains(label)){
					hasLabel = true;
				}
			}
			String resultData = hasLabel ? textAfterFirstBracket(plainText, null) : plainText;

			Elements images = elFor.getElementsByTag("img");
			if(images.size() == 0){
				// Text only.
				if(StringUtils.isNotBlank(resultData)){
					if(isTitle){
						returnData.append(resultData);
					}else{
						returnData.append("<P>").append(resultData).append("</P>");
					}
				}
				continue;
			}
			if(!hasVisibleText(resultData)){
				// Images only.
				returnData.append("<p>");
				for(Element image : images){
					returnData.append(image.toString());
				}
				returnData.append("</p>");
				continue;
			}
			// Text and images: add placeholders, re-read the text, then substitute.
			for(int i =0;i<images.size() ;i++){
				images.get(i).parent().appendElement("span").text("<Ф图片"+i+"Ф>");
			}
			String markedText = elFor.text();
			resultData = hasLabel ? textAfterFirstBracket(markedText, resultData) : markedText;
			for(int i =0;i<images.size() ;i++){
				resultData = resultData.replace("<Ф图片"+i+"Ф>", images.get(i).toString());
			}
			returnData.append("<P>").append(resultData).append("</P>");
		}
		return returnData.toString();
	}

	/**
	 * Returns the text after the first {@code >}, or {@code fallback} when the text
	 * contains none.
	 */
	private static String textAfterFirstBracket(String text, String fallback){
		int bracket = text.indexOf('>');
		return bracket >= 0 ? text.substring(bracket + 1) : fallback;
	}

	/**
	 * Tells whether the text is non-blank and holds at least one character that is not
	 * a Unicode space character (this also rules out full-width and non-breaking spaces).
	 */
	private static boolean hasVisibleText(String text){
		if(!StringUtils.isNotBlank(text)){
			return false;
		}
		for(char character : text.toCharArray()){
			if(!Character.isSpaceChar(character)){
				return true;
			}
		}
		return false;
	}

此方法中,判断有图片后的for循环方法,charArrayFor,其实是判断是否有中文空格啥的

/**
	 * Extracts the first number from a string and returns it as an integer.
	 *
	 * <p>A decimal number ({@code digits.digits}) anywhere in the string takes
	 * precedence; its integer part is returned, because a decimal literal cannot be
	 * parsed as an {@code Integer}. Without a decimal number, the first run of digits
	 * is returned.
	 *
	 * @param str text to search; may be {@code null}
	 * @return the number found, or {@code null} when {@code str} is null or holds no digits
	 * @throws NumberFormatException when the digits do not fit into an {@code int}
	 */
	public  Integer charAtStringReturnDouble(String str){
		if(str == null){
			return null;
		}
		String trimmed = str.trim();
		// Group 1 is the integer part of the first decimal number.
		Matcher decimal = Pattern.compile("(\\d+)\\.\\d+").matcher(trimmed);
		if(decimal.find()){
			return Integer.parseInt(decimal.group(1));
		}
		Matcher integer = Pattern.compile("(\\d+)").matcher(trimmed);
		if(integer.find()){
			return Integer.parseInt(integer.group(1));
		}
		return null;
	}
	

先写到这。下篇文章写一下关于jsoup的常用方法,以后再用的时候能用得上