最近喜欢上了高品质音乐,但是很多抓下来的cue文件是繁体中文,看上去略不爽。大陆的Windows默认用的是GBK编码,所以准备写个程序把其中的繁体字转成简体字。分三步。
第一步,下载GBK中文字库。放到wps里,然后进行繁简体转换,得到繁简对应。放到txt文件中,使用ANSI编码。
第二步,对字库进行预处理。包括两部分:去重,排序。
去重指把繁简体相同的字符去掉。
代码:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Finds the characters that differ between two files.
 *
 * <p>The two files are read line by line in parallel. For every pair of trimmed lines of
 * equal length, each position whose characters differ contributes the character from the
 * first file to one output and the character from the second file to the other output, so
 * the n-th character of one output corresponds to the n-th character of the other.
 *
 * <p>Each output is written next to its input, with {@code d} prepended to the file name.
 * Files are read and written with the platform default encoding, which is printed at start.
 *
 * @author GT (2013.1.17)
 */
public class FindDifferent {

    /**
     * Entry point.
     *
     * @param args exactly two paths: the two files to compare
     * @throws IOException if a file cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        System.out.println(System.getProperty("file.encoding"));
        if (args.length != 2) {
            System.err.println("not enough files");
            return;
        }

        StringBuilder firstDiff = new StringBuilder();
        StringBuilder secondDiff = new StringBuilder();

        // Plain try/finally keeps the listing free of newer language features while
        // still guaranteeing that both readers are closed when anything throws.
        BufferedReader first = new BufferedReader(new FileReader(args[0]));
        try {
            BufferedReader second = new BufferedReader(new FileReader(args[1]));
            try {
                collectDifferences(first, second, firstDiff, secondDiff);
            } finally {
                second.close();
            }
        } finally {
            first.close();
        }

        writeText(prefixedSibling(args[0]), firstDiff);
        writeText(prefixedSibling(args[1]), secondDiff);
    }

    /**
     * Walks both readers in lock step and appends every differing character pair.
     *
     * <p>Line pairs whose trimmed lengths differ are reported on stderr and skipped.
     * A warning is printed when one file has more lines than the other.
     */
    private static void collectDifferences(BufferedReader first, BufferedReader second,
            StringBuilder firstDiff, StringBuilder secondDiff) throws IOException {
        String line1;
        String line2;
        while ((line1 = first.readLine()) != null && (line2 = second.readLine()) != null) {
            line1 = line1.trim();
            line2 = line2.trim();
            if (line1.length() != line2.length()) {
                System.err.println("not same length " + line1.length() + " " + line2.length());
                continue;
            }
            for (int i = 0; i < line1.length(); ++i) {
                if (line1.charAt(i) != line2.charAt(i)) {
                    firstDiff.append(line1.charAt(i));
                    secondDiff.append(line2.charAt(i));
                }
            }
        }
        // line1 != null means the second file ran out first; otherwise probe the second file.
        if (line1 != null || second.readLine() != null) {
            System.err.println("not same number of lines");
        }
    }

    /**
     * Returns the output file for {@code inputPath}: same directory, name prefixed with "d".
     * For a bare file name this is identical to {@code "d" + inputPath}.
     */
    private static File prefixedSibling(String inputPath) {
        File input = new File(inputPath);
        // A null parent makes File fall back to the single-argument form.
        return new File(input.getParent(), "d" + input.getName());
    }

    /** Writes {@code text} to {@code target}, closing the writer even when writing fails. */
    private static void writeText(File target, CharSequence text) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(target));
        try {
            writer.append(text);
            writer.flush();
        } finally {
            writer.close();
        }
    }
}
// Note from the original post: the processed files have been uploaded at
// http://download.csdn.net/detail/pouloghost/5005734
排序使用堆排序,需要对繁简体两个文件进行相同的操作,以保证繁简字符一一对应。代码:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Sorts the characters of {@code trad.txt} in ascending order of their char value and
 * applies the same reordering to {@code simp.txt}, so that characters at the same index
 * stay paired. The results are written to {@code traditional.txt} and {@code simple.txt}.
 *
 * <p>Only the first line of each input file is used. Sorting is an in-place heap sort.
 *
 * @author GT (2013.1.17)
 */
public class SortFile {

    /**
     * Entry point.
     *
     * @param args unused
     * @throws IOException if an input cannot be read or an output cannot be written
     */
    public static void main(String[] args) throws IOException {
        StringBuffer trad = new StringBuffer(readFirstLine("trad.txt"));
        StringBuffer simp = new StringBuffer(readFirstLine("simp.txt"));

        // Every swap touches the same index in both buffers, so they must be equally long.
        if (trad.length() != simp.length()) {
            System.err.println("not same length " + trad.length() + " " + simp.length());
            return;
        }

        buildHeap(trad, simp);
        // Repeatedly move the current maximum behind the shrinking heap.
        for (int size = trad.length() - 1; size > 0; --size) {
            exchange(trad, simp, 0, size);
            maxHeapify(trad, simp, 0, size);
        }
        System.out.println("traditional in order " + check(trad));

        writeText("traditional.txt", trad.toString());
        writeText("simple.txt", simp.toString());
    }

    /**
     * Reads the first line of {@code path}; an empty file yields an empty string
     * instead of {@code null}.
     */
    private static String readFirstLine(String path) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line = reader.readLine();
            return line == null ? "" : line;
        } finally {
            reader.close();
        }
    }

    /** Writes {@code text} to {@code path}, closing the writer even when writing fails. */
    private static void writeText(String path, String text) throws IOException {
        FileWriter writer = new FileWriter(path);
        try {
            writer.write(text);
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /**
     * Sifts the element at index {@code i} down until the subtree rooted there satisfies
     * the max-heap property with respect to {@code trad}, within the first {@code size}
     * elements. {@code simp} is permuted identically.
     */
    private static void maxHeapify(StringBuffer trad, StringBuffer simp, int i,
            int size) {
        int root = i;
        while (true) {
            int left = 2 * root + 1;
            int right = left + 1;
            int max = root;
            if (left < size && trad.charAt(max) < trad.charAt(left)) {
                max = left;
            }
            if (right < size && trad.charAt(max) < trad.charAt(right)) {
                max = right;
            }
            if (max == root) {
                return;
            }
            exchange(trad, simp, root, max);
            root = max;
        }
    }

    /** Swaps positions {@code a} and {@code b} in both buffers. */
    private static void exchange(StringBuffer trad, StringBuffer simp, int a,
            int b) {
        char tradTemp = trad.charAt(a);
        char simpTemp = simp.charAt(a);
        trad.setCharAt(a, trad.charAt(b));
        simp.setCharAt(a, simp.charAt(b));
        trad.setCharAt(b, tradTemp);
        simp.setCharAt(b, simpTemp);
    }

    /** Turns the whole of {@code trad} into a max-heap, permuting {@code simp} identically. */
    private static void buildHeap(StringBuffer trad, StringBuffer simp) {
        // Indices past length / 2 - 1 have no children, so they are already heaps.
        for (int i = trad.length() / 2 - 1; i >= 0; --i) {
            maxHeapify(trad, simp, i, trad.length());
        }
    }

    /**
     * Verifies that {@code trad} is in non-descending order.
     *
     * @return {@code true} when ordered; otherwise prints the index of the first
     *         out-of-order element and returns {@code false}
     */
    private static boolean check(StringBuffer trad) {
        for (int i = 0; i + 1 < trad.length(); ++i) {
            if (trad.charAt(i) > trad.charAt(i + 1)) {
                System.out.println(i);
                return false;
            }
        }
        return true;
    }
}
第三步,使用繁简体对应表,处理文件。代码:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Replaces every traditional Chinese character of a file with its simplified counterpart.
 *
 * <p>The mapping comes from two tables: the first line of {@code traditional.txt} and the
 * first line of {@code simple.txt}. A character found at index {@code k} of the
 * traditional table is replaced by the character at index {@code k} of the simplified
 * table. The traditional table must be sorted in ascending char order because it is
 * binary-searched. The result is written next to the input, with {@code simple}
 * prepended to the file name.
 *
 * <p>Files are read and written with the platform default encoding (the author's note
 * says the files are GBK encoded).
 *
 * @author GT (2013.1.17)
 */
public class Main {
    static String traditional = null;
    static String simple = null;

    /**
     * Entry point.
     *
     * @param args exactly one path: the file to convert
     * @throws IOException if a table or the input cannot be read, or the output cannot
     *         be written
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.err.println("not enough files");
            return;
        }
        initial();

        File input = new File(args[0]);
        // Prefix the file name only, so an input given with a directory still works;
        // for a bare file name this equals "simple" + args[0].
        File output = new File(input.getParent(), "simple" + input.getName());

        // Plain try/finally keeps the listing free of newer language features while
        // still guaranteeing that reader and writer are closed when anything throws.
        BufferedReader br = new BufferedReader(new FileReader(input));
        try {
            BufferedWriter bw = new BufferedWriter(new FileWriter(output));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    simplify(line, bw);
                    bw.newLine();
                }
                bw.flush();
            } finally {
                bw.close();
            }
        } finally {
            br.close();
        }
    }

    /** Loads both mapping tables into {@link #traditional} and {@link #simple}. */
    private static void initial() throws IOException {
        traditional = readFirstLine("traditional.txt");
        simple = readFirstLine("simple.txt");
    }

    /**
     * Reads the first line of {@code path}; an empty file yields an empty string
     * instead of {@code null}.
     */
    private static String readFirstLine(String path) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line = reader.readLine();
            return line == null ? "" : line;
        } finally {
            reader.close();
        }
    }

    /**
     * Writes {@code line} to {@code bw}, replacing each character found in the
     * traditional table and copying every other character unchanged.
     */
    private static void simplify(String line, BufferedWriter bw)
            throws IOException {
        for (int i = 0; i < line.length(); ++i) {
            char ch = line.charAt(i);
            int index = find(ch);
            bw.append(index != -1 ? simple.charAt(index) : ch);
        }
    }

    /**
     * Binary search for {@code ch} in the sorted traditional table (2013.1.18).
     *
     * @return the index of {@code ch}, or -1 when it is not in the table
     */
    private static int find(char ch) {
        int low = 0;
        // The upper bound is inclusive, so it must be the last valid index. Starting at
        // length() made the search read one past the end for any character greater than
        // every table entry.
        int high = traditional.length() - 1;
        while (low <= high) {
            int mid = (low + high) >>> 1;
            char candidate = traditional.charAt(mid);
            if (candidate == ch) {
                return mid;
            }
            if (candidate < ch) {
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        return -1;
    }
}