centos上 java使用Tesseract进行ocr识别

时间:2022-03-24 09:01:45

1、安装过程:

安装ocr

yum install tesseract-ocr    # 若提示找不到该包,请改用 yum install tesseract(下文 rpm -qa 的输出显示实际安装的包名为 tesseract)

查找中文包
yum search tesseract-ocr | grep sim

安装中文包
yum install   tesseract-langpack-chi_sim

 

安装版本信息:

➜ test-ugc-api01 tesseract tesseract -v
tesseract 3.04.00
leptonica-1.72
  libgif 4.1.6(?) : libjpeg 6b (libjpeg-turbo 1.2.90) : libpng 1.5.13 : libtiff 4.0.3 : zlib 1.2.7 : libwebp 0.3.0

 

2、java开发

注意版本匹配:系统安装的 Tesseract 版本为 3.04.00,Java 端采用与之对应的 tess4j 3.0.0

  
  
  
<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>3.0.0</version>
</dependency>

 

简单测试代码

  
  
  
/**
 * Downloads the image at {@code url} and recognizes its text with Tesseract through the
 * tess4j {@code TessAPI1} bindings, using the {@code chi_sim} language data.
 *
 * <p>Failures are not propagated to the caller: any exception is printed to stderr and the
 * fixed fallback string is returned instead. Native Tesseract resources are released on
 * every path, including failures.
 *
 * @param url address of the image to recognize; surrounding whitespace is trimmed
 * @return the recognized text (possibly empty), or {@code " no detected words!! "} when
 *         the image could not be loaded or recognition failed
 */
public String ocr(String url) {
    // Must be the PARENT directory of "tessdata" (here /usr/share/tesseract/tessdata).
    // No surrounding whitespace: the native library takes the value verbatim.
    String datapath = "/usr/share/tesseract/";
    String language = "chi_sim";

    // Declared outside the try so the finally block can free them on every path.
    ITessAPI.TessBaseAPI handle = null;
    Pointer utf8Text = null;

    try {
        url = url.trim();
        System.out.println( " url is: " + url);

        BufferedImage image = ImageIO.read(new URL(url));
        if (image == null) {
            // ImageIO.read returns null when no registered reader can decode the content.
            throw new IllegalArgumentException("cannot decode an image from: " + url);
        }

        ByteBuffer buf = ImageIOHelper.convertImageData(image);
        int bpp = image.getColorModel().getPixelSize();
        int bytespp = bpp / 8;
        // Bytes per line, rounded up so sub-byte pixel sizes still cover the whole row.
        int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0);
        System.out.println( " bpp is: " + bpp + " ;bytespp is: " + bytespp + " ;bytespl is: " + bytespl);

        // Initialize the engine; a non-zero return code means the language data failed to load.
        handle = TessAPI1.TessBaseAPICreate();
        if (TessAPI1.TessBaseAPIInit3(handle, datapath, language) != 0) {
            throw new IllegalStateException(
                    "Tesseract init failed for datapath=" + datapath + ", language=" + language);
        }
        TessAPI1.TessBaseAPISetPageSegMode(handle, ITessAPI.TessPageSegMode.PSM_AUTO);

        utf8Text = TessAPI1.TessBaseAPIRect(
                handle, buf, bytespp, bytespl, 0, 0, image.getWidth(), image.getHeight());
        if (utf8Text == null) {
            throw new IllegalStateException("Tesseract returned no text buffer for: " + url);
        }
        // NOTE(review): getString(0) decodes with JNA's default encoding — confirm it is
        // UTF-8 in this deployment, otherwise Chinese output may be garbled.
        String result = utf8Text.getString(0);

        System.out.println( " ============================================== " );
        System.out.println( " result is: " + result);
        System.out.println( " ============================================== " );
        if (result.isEmpty()) {
            System.out.println( " no detected words!! " );
        }
        return result;
    } catch (Exception ex) {
        // Best-effort contract: report the failure and fall through to the fallback value.
        ex.printStackTrace();
    } finally {
        // Free native memory even when recognition failed part-way through.
        if (utf8Text != null) {
            TessAPI1.TessDeleteText(utf8Text);
        }
        if (handle != null) {
            TessAPI1.TessBaseAPIDelete(handle);
        }
    }
    return " no detected words!! " ;
}

注意:datapath 要设置为 tessdata 目录的上一级目录(本例中 tessdata 位于 /usr/share/tesseract/tessdata,因此 datapath 为 /usr/share/tesseract/)

3、yum安装所在目录查询相关命令

  
  
  
#查询相关包
test-ugc-api01 tesseract rpm -qa | grep tesseract
tesseract-langpack-chi_sim-3.04.00-3.el7.noarch
tesseract-3.04.00-3.el7.x86_64

#查询包具体安装位置
test-ugc-api01 tesseract rpm -ql tesseract-3.04.00-3.el7.x86_64
/usr/bin/ambiguous_words
/usr/bin/classifier_tester
/usr/bin/cntraining
/usr/bin/combine_tessdata
/usr/bin/dawg2wordlist
/usr/bin/mftraining
/usr/bin/set_unicharset_properties
/usr/bin/shapeclustering
/usr/bin/tesseract
/usr/bin/text2image
/usr/bin/unicharset_extractor
/usr/bin/wordlist2dawg
/usr/lib64/libtesseract.so.3
/usr/lib64/libtesseract.so.3.0.4
/usr/share/doc/tesseract-3.04.00
/usr/share/doc/tesseract-3.04.00/AUTHORS
/usr/share/doc/tesseract-3.04.00/ChangeLog
/usr/share/doc/tesseract-3.04.00/NEWS
/usr/share/doc/tesseract-3.04.00/README
/usr/share/doc/tesseract-3.04.00/eurotext.tif
/usr/share/doc/tesseract-3.04.00/phototest.tif
/usr/share/licenses/tesseract-3.04.00
/usr/share/licenses/tesseract-3.04.00/COPYING
/usr/share/man/man1/ambiguous_words.1.gz
/usr/share/man/man1/cntraining.1.gz
/usr/share/man/man1/combine_tessdata.1.gz
/usr/share/man/man1/dawg2wordlist.1.gz
/usr/share/man/man1/mftraining.1.gz
/usr/share/man/man1/shapeclustering.1.gz
/usr/share/man/man1/tesseract.1.gz
/usr/share/man/man1/unicharset_extractor.1.gz
/usr/share/man/man1/wordlist2dawg.1.gz
/usr/share/man/man5/unicharambigs.5.gz
/usr/share/man/man5/unicharset.5.gz
/usr/share/tesseract
/usr/share/tesseract/tessdata
/usr/share/tesseract/tessdata/configs
/usr/share/tesseract/tessdata/configs/ambigs.train
/usr/share/tesseract/tessdata/configs/api_config
/usr/share/tesseract/tessdata/configs/bigram
/usr/share/tesseract/tessdata/configs/box.train
/usr/share/tesseract/tessdata/configs/box.train.stderr
/usr/share/tesseract/tessdata/configs/digits
/usr/share/tesseract/tessdata/configs/hocr
/usr/share/tesseract/tessdata/configs/inter
/usr/share/tesseract/tessdata/configs/kannada
/usr/share/tesseract/tessdata/configs/linebox
/usr/share/tesseract/tessdata/configs/logfile
/usr/share/tesseract/tessdata/configs/makebox
/usr/share/tesseract/tessdata/configs/pdf
/usr/share/tesseract/tessdata/configs/quiet
/usr/share/tesseract/tessdata/configs/rebox
/usr/share/tesseract/tessdata/configs/strokewidth
/usr/share/tesseract/tessdata/configs/unlv
/usr/share/tesseract/tessdata/eng.cube.bigrams
/usr/share/tesseract/tessdata/eng.cube.fold
/usr/share/tesseract/tessdata/eng.cube.lm
/usr/share/tesseract/tessdata/eng.cube.nn
/usr/share/tesseract/tessdata/eng.cube.params
/usr/share/tesseract/tessdata/eng.cube.size
/usr/share/tesseract/tessdata/eng.cube.word-freq
/usr/share/tesseract/tessdata/eng.tesseract_cube.nn
/usr/share/tesseract/tessdata/eng.traineddata
/usr/share/tesseract/tessdata/pdf.ttf
/usr/share/tesseract/tessdata/tessconfigs
/usr/share/tesseract/tessdata/tessconfigs/batch
/usr/share/tesseract/tessdata/tessconfigs/batch.nochop
/usr/share/tesseract/tessdata/tessconfigs/matdemo
/usr/share/tesseract/tessdata/tessconfigs/msdemo
/usr/share/tesseract/tessdata/tessconfigs/nobatch
/usr/share/tesseract/tessdata/tessconfigs/segdemo

查看.so文件接口

    nm -D xxx.so