1、安装过程:
安装ocr
yum install tesseract-ocr
查找中文包
yum search tesseract-ocr | grep sim
安装中文包
yum install tesseract-langpack-chi_sim
安装版本信息:
[test-ugc-api01 tesseract]$ tesseract -v
tesseract 3.04.00
leptonica-1.72
libgif 4.1.6(?) : libjpeg 6b (libjpeg-turbo 1.2.90) : libpng 1.5.13 : libtiff 4.0.3 : zlib 1.2.7 : libwebp 0.3.0
2、java开发
注意版本匹配:3.04.00,采用tess4j
<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>3.0.0</version>
</dependency>
简单测试代码
/**
 * Downloads the image at {@code url} and runs Tesseract OCR on it using the
 * Simplified Chinese ({@code chi_sim}) language data.
 *
 * @param url location of the image to recognise; surrounding whitespace is trimmed
 * @return the recognised text (possibly empty), or {@code " no detected words!! "}
 *         when the image cannot be decoded, the engine cannot be initialised,
 *         recognition yields no buffer, or any exception is thrown
 */
public String ocr(String url) {
    final String fallback = " no detected words!! ";
    // datapath must be the PARENT directory of "tessdata". The literals must not
    // carry leading/trailing spaces, otherwise the traineddata file is not found.
    String datapath = "/usr/share/tesseract/";
    String language = "chi_sim";

    // Native resources; released in the finally block on every exit path.
    ITessAPI.TessBaseAPI handle = null;
    Pointer utf8Text = null;
    try {
        url = url.trim();
        System.out.println(" url is: " + url);
        URL targetUrl = new URL(url);
        BufferedImage image = ImageIO.read(targetUrl);
        if (image == null) {
            // ImageIO.read returns null when no registered reader can decode the data.
            System.err.println("cannot decode image from: " + url);
            return fallback;
        }

        ByteBuffer buf = ImageIOHelper.convertImageData(image);
        int bpp = image.getColorModel().getPixelSize();
        int bytespp = bpp / 8;
        int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0);
        System.out.println(" bpp is: " + bpp + " ;bytespp is: " + bytespp + " ;bytespl is: " + bytespl);

        // Initialise the engine; a non-zero return code means initialisation failed.
        handle = TessAPI1.TessBaseAPICreate();
        if (TessAPI1.TessBaseAPIInit3(handle, datapath, language) != 0) {
            System.err.println("cannot initialise tesseract: datapath=" + datapath + ", language=" + language);
            return fallback;
        }
        TessAPI1.TessBaseAPISetPageSegMode(handle, ITessAPI.TessPageSegMode.PSM_AUTO);

        utf8Text = TessAPI1.TessBaseAPIRect(handle, buf, bytespp, bytespl, 0, 0, image.getWidth(), image.getHeight());
        if (utf8Text == null) {
            System.err.println("tesseract returned no text buffer for: " + url);
            return fallback;
        }
        String result = utf8Text.getString(0);

        System.out.println(" ============================================== ");
        System.out.println(" result is: " + result);
        System.out.println(" ============================================== ");
        if (result.isEmpty()) {
            System.out.println(" no detected words!! ");
        }
        return result;
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        // Free the native text buffer first, then the engine handle that produced it.
        if (utf8Text != null) {
            TessAPI1.TessDeleteText(utf8Text);
        }
        if (handle != null) {
            TessAPI1.TessBaseAPIDelete(handle);
        }
    }
    return fallback;
}
注意:datapath 必须设置为 tessdata 目录的上一级目录(例如 tessdata 位于 /usr/share/tesseract/tessdata 时,datapath 应为 /usr/share/tesseract/)。
3、yum安装所在目录查询相关命令
#查询相关包
[test-ugc-api01 tesseract]$ rpm -qa | grep tesseract
tesseract-langpack-chi_sim-3.04.00-3.el7.noarch
tesseract-3.04.00-3.el7.x86_64
#查询包具体安装位置
[test-ugc-api01 tesseract]$ rpm -ql tesseract-3.04.00-3.el7.x86_64
/usr/bin/ambiguous_words
/usr/bin/classifier_tester
/usr/bin/cntraining
/usr/bin/combine_tessdata
/usr/bin/dawg2wordlist
/usr/bin/mftraining
/usr/bin/set_unicharset_properties
/usr/bin/shapeclustering
/usr/bin/tesseract
/usr/bin/text2image
/usr/bin/unicharset_extractor
/usr/bin/wordlist2dawg
/usr/lib64/libtesseract.so.3
/usr/lib64/libtesseract.so.3.0.4
/usr/share/doc/tesseract-3.04.00
/usr/share/doc/tesseract-3.04.00/AUTHORS
/usr/share/doc/tesseract-3.04.00/ChangeLog
/usr/share/doc/tesseract-3.04.00/NEWS
/usr/share/doc/tesseract-3.04.00/README
/usr/share/doc/tesseract-3.04.00/eurotext.tif
/usr/share/doc/tesseract-3.04.00/phototest.tif
/usr/share/licenses/tesseract-3.04.00
/usr/share/licenses/tesseract-3.04.00/COPYING
/usr/share/man/man1/ambiguous_words.1.gz
/usr/share/man/man1/cntraining.1.gz
/usr/share/man/man1/combine_tessdata.1.gz
/usr/share/man/man1/dawg2wordlist.1.gz
/usr/share/man/man1/mftraining.1.gz
/usr/share/man/man1/shapeclustering.1.gz
/usr/share/man/man1/tesseract.1.gz
/usr/share/man/man1/unicharset_extractor.1.gz
/usr/share/man/man1/wordlist2dawg.1.gz
/usr/share/man/man5/unicharambigs.5.gz
/usr/share/man/man5/unicharset.5.gz
/usr/share/tesseract
/usr/share/tesseract/tessdata
/usr/share/tesseract/tessdata/configs
/usr/share/tesseract/tessdata/configs/ambigs.train
/usr/share/tesseract/tessdata/configs/api_config
/usr/share/tesseract/tessdata/configs/bigram
/usr/share/tesseract/tessdata/configs/box.train
/usr/share/tesseract/tessdata/configs/box.train.stderr
/usr/share/tesseract/tessdata/configs/digits
/usr/share/tesseract/tessdata/configs/hocr
/usr/share/tesseract/tessdata/configs/inter
/usr/share/tesseract/tessdata/configs/kannada
/usr/share/tesseract/tessdata/configs/linebox
/usr/share/tesseract/tessdata/configs/logfile
/usr/share/tesseract/tessdata/configs/makebox
/usr/share/tesseract/tessdata/configs/pdf
/usr/share/tesseract/tessdata/configs/quiet
/usr/share/tesseract/tessdata/configs/rebox
/usr/share/tesseract/tessdata/configs/strokewidth
/usr/share/tesseract/tessdata/configs/unlv
/usr/share/tesseract/tessdata/eng.cube.bigrams
/usr/share/tesseract/tessdata/eng.cube.fold
/usr/share/tesseract/tessdata/eng.cube.lm
/usr/share/tesseract/tessdata/eng.cube.nn
/usr/share/tesseract/tessdata/eng.cube.params
/usr/share/tesseract/tessdata/eng.cube.size
/usr/share/tesseract/tessdata/eng.cube.word-freq
/usr/share/tesseract/tessdata/eng.tesseract_cube.nn
/usr/share/tesseract/tessdata/eng.traineddata
/usr/share/tesseract/tessdata/pdf.ttf
/usr/share/tesseract/tessdata/tessconfigs
/usr/share/tesseract/tessdata/tessconfigs/batch
/usr/share/tesseract/tessdata/tessconfigs/batch.nochop
/usr/share/tesseract/tessdata/tessconfigs/matdemo
/usr/share/tesseract/tessdata/tessconfigs/msdemo
/usr/share/tesseract/tessdata/tessconfigs/nobatch
/usr/share/tesseract/tessdata/tessconfigs/segdemo
查看.so文件接口
nm -D xxx.so