Java图片识别文字的方法

时间:2022-08-30 16:28:53

Java文字识别程序的关键是寻找一个可以调用的OCR引擎。Tesseract-OCR就是一个这样的OCR引擎,在1985年到1995年由HP实验室开发,现在由Google维护。Tesseract-OCR从3.0版本开始支持中文。不过Tesseract-OCR 3.0本身不是图形化界面的客户端,别人写的FreeOCR图形化客户端还不支持导入新的3.0 traineddata。但这标志着,现在有免费、开源的中文OCR软件了。

Java中使用Tesseract-OCR 3.01的步骤如下:

1.下载安装tesseract-ocr-setup-3.01-1.exe(3.0以上版本才增加了中文识别)

2.在安装向导中可以选择需要下载的语言包。

3.到网上搜索下载java图形处理所需的2个包:jai_imageio-1.1-alpha.jar,swingx-1.6.1.jar

4.java程序清单:

imageiohelper 类:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;
import java.util.NoSuchElementException;

import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import javax.imageio.stream.ImageOutputStream;

import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;
 
/**
 * Helper that converts an image file into an uncompressed TIFF copy, which is
 * the form handed to the OCR engine.
 */
public class ImageIOHelper {

    /**
     * Reads the first image of {@code imageFile} and writes it, uncompressed,
     * as a TIFF file next to the original (see {@link #tempImageFile(File)}
     * for the naming rule).
     *
     * @param imageFile   the source image
     * @param imageFormat ImageIO format name of the source, e.g. {@code "png"}
     * @return the TIFF file that was written. If an {@link IOException} occurs
     *         its stack trace is printed and the value computed so far is
     *         returned: {@code null} when the failure happened before the
     *         output file was chosen, otherwise the (possibly incomplete)
     *         output file.
     * @throws NoSuchElementException if no ImageIO reader is registered for
     *         {@code imageFormat} or no TIFF writer is registered
     */
    public static File createImage(File imageFile, String imageFormat) {
        Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
        if (!readers.hasNext()) {
            throw new NoSuchElementException("No ImageIO reader for format: " + imageFormat);
        }
        Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
        if (!writers.hasNext()) {
            throw new NoSuchElementException("No ImageIO writer for format: tiff");
        }
        ImageReader reader = readers.next();
        ImageWriter writer = writers.next();

        File tempFile = null;
        try (ImageInputStream iis = ImageIO.createImageInputStream(imageFile)) {
            if (iis == null) {
                throw new IOException("Cannot open image for reading: " + imageFile);
            }
            reader.setInput(iis);
            // Read the stream metadata so it can be handed on to the writer.
            IIOMetadata streamMetadata = reader.getStreamMetadata();

            // Ask the writer for its own parameter object instead of
            // instantiating a plugin-specific class, so the parameter always
            // matches whichever TIFF writer was selected above.
            ImageWriteParam tiffWriteParam = writer.getDefaultWriteParam();
            tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

            BufferedImage bi = reader.read(0);
            IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

            tempFile = tempImageFile(imageFile);
            try (ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile)) {
                if (ios == null) {
                    throw new IOException("Cannot open image for writing: " + tempFile);
                }
                writer.setOutput(ios);
                writer.write(streamMetadata, image, tiffWriteParam);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release plugin resources on every path, including failures.
            writer.dispose();
            reader.dispose();
        }
        return tempFile;
    }

    /**
     * Derives the name of the temporary TIFF file: a {@code 0} is appended to
     * the base name and the extension becomes {@code tif}, so
     * {@code dir/4.png} maps to {@code dir/40.tif}. A name without any
     * extension simply gets {@code 0.tif} appended.
     */
    private static File tempImageFile(File imageFile) {
        String name = imageFile.getName();
        int dot = name.lastIndexOf('.');
        String base = (dot >= 0) ? name.substring(0, dot) : name;
        return new File(imageFile.getParentFile(), base + "0.tif");
    }

}

ocr 类:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
package com.hhp.util;
 
import java.io.bufferedreader;
import java.io.file;
import java.io.fileinputstream;
import java.io.inputstreamreader;
import java.util.arraylist;
import java.util.list;
import org.jdesktop.swingx.util.os;
 
public class ocr {
 private final string lang_option = "-l"; //英文字母小写l,并非数字1
 private final string eol = system.getproperty("line.separator");
 private string tesspath = "c://program files (x86)//tesseract-ocr";
 //private string tesspath = new file("tesseract").getabsolutepath();
  
 public string recognizetext(file imagefile,string imageformat)throws exception{
  file tempimage = imageiohelper.createimage(imagefile,imageformat);
  file outputfile = new file(imagefile.getparentfile(),"output");
  stringbuffer strb = new stringbuffer();
  list cmd = new arraylist();
  if(os.iswindowsxp()){
   cmd.add(tesspath+"//tesseract");
  }else if(os.islinux()){
   cmd.add("tesseract");
  }else{
   cmd.add(tesspath+"//tesseract");
  }
  cmd.add("");
  cmd.add(outputfile.getname());
  cmd.add(lang_option);
  cmd.add("chi_sim");
  //cmd.add("eng");
   
  processbuilder pb = new processbuilder();
  pb.directory(imagefile.getparentfile());
   
  cmd.set(1, tempimage.getname());
  pb.command(cmd);
  pb.redirecterrorstream(true);
   
  process process = pb.start();
  //tesseract.exe 1.jpg 1 -l chi_sim
  int w = process.waitfor();
   
  //删除临时正在工作文件
  tempimage.delete();
   
  if(w==0){
   bufferedreader in = new bufferedreader(new inputstreamreader(new fileinputstream(outputfile.getabsolutepath()+".txt"),"utf-8"));
    
   string str;
   while((str = in.readline())!=null){
    strb.append(str).append(eol);
   }
   in.close();
  }else{
   string msg;
   switch(w){
    case 1:
     msg = "errors accessing files.there may be spaces in your image's filename.";
     break;
    case 29:
     msg = "cannot recongnize the image or its selected region.";
     break;
    case 31:
     msg = "unsupported image format.";
     break;
    default:
     msg = "errors occurred.";
   }
   tempimage.delete();
   throw new runtimeexception(msg);
  }
  new file(outputfile.getabsolutepath()+".txt").delete();
  return strb.tostring();
 }
}

测试类 OCRTest:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import java.io.file;
import java.io.ioexception;
 
import com.hhp.util.ocr;
 
public class ocrtest {
 
 public static void main(string[] args) {
  string path = "c://temp//ocrcode//4.png"
  system.out.println("orc test begin......");
  try
   string valcode = new ocr().recognizetext(new file(path), "png"); 
   system.out.println(valcode); 
  } catch (ioexception e) { 
   e.printstacktrace(); 
  } catch (exception e) {
   e.printstacktrace();
  }  
  system.out.println("orc test end......");
 }
 
}

经过测试,Tesseract-OCR 3.01的文字识别率很高,对于网站中常见的验证码识别率也很高。

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。

原文链接:https://blog.csdn.net/tiny_lxf/article/details/78533485