Java实现GBK编码文件繁简体转换

时间:2023-01-14 14:34:11

最近喜欢上了高品质音乐,但是抓下来的cue文件很多是繁体中文,看上去略不爽。大陆的Windows默认使用GBK编码,所以准备写个小程序,把GBK编码的繁体文件转成简体。分三步。

第一步,下载GBK中文字库。把字库放到WPS里进行繁简体转换,得到逐字对应的繁体、简体两份文本,分别保存为txt文件,使用ANSI编码(简体中文Windows下即GBK)。

第二步,对字库进行预处理。包括两部分:去重,排序。

去重指把繁简体相同的字符去掉。

代码:


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Compares two text files character by character and keeps only the
 * characters that differ between them.
 *
 * <p>For every position at which the two files disagree, the character from
 * the first file is appended to the first output file and the character from
 * the second file is appended to the second output file. The outputs contain
 * no line separators, so each output is a single line whose characters stay
 * aligned index by index with the other output.
 *
 * @author GT
 * @since 2013.1.17
 */
public class FindDifferent {

	/**
	 * Reads the two files named in {@code args} line by line and writes the
	 * differing characters next to each input as {@code d<name>}.
	 *
	 * <p>Lines are trimmed before comparison. A pair of lines with different
	 * lengths is reported on standard error and skipped. Processing stops as
	 * soon as either file runs out of lines.
	 *
	 * @param args
	 *            exactly two input file paths
	 * @throws IOException
	 *             if a file cannot be opened, read or written
	 */
	public static void main(String[] args) throws IOException {
		// FileReader/FileWriter use the default charset, so print it to make
		// the encoding actually in effect visible to the user.
		System.out.println(System.getProperty("file.encoding"));
		if (args.length != 2) {
			System.err.println("usage: java FindDifferent <file1> <file2>");
			return;
		}
		// try-with-resources closes (and thereby flushes) all four streams
		// even when reading or writing fails part-way through.
		try (BufferedReader br1 = new BufferedReader(new FileReader(args[0]));
				BufferedReader br2 = new BufferedReader(new FileReader(args[1]));
				BufferedWriter bw1 = new BufferedWriter(new FileWriter(
						outputPath(args[0])));
				BufferedWriter bw2 = new BufferedWriter(new FileWriter(
						outputPath(args[1])))) {
			String line1;
			String line2;
			while ((line1 = br1.readLine()) != null
					&& (line2 = br2.readLine()) != null) {
				line1 = line1.trim();
				line2 = line2.trim();
				if (line1.length() != line2.length()) {
					// Positions cannot be paired up reliably, skip the line.
					System.err.println("not same length " + line1.length()
							+ " " + line2.length());
					continue;
				}
				for (int i = 0; i < line1.length(); ++i) {
					if (line1.charAt(i) != line2.charAt(i)) {
						bw1.append(line1.charAt(i));
						bw2.append(line2.charAt(i));
					}
				}
			}
		}
	}

	/**
	 * Builds the output path for an input path by prefixing the file name
	 * (not the whole path) with {@code "d"}, so the output is placed in the
	 * same directory as the input. For a bare file name this is simply
	 * {@code "d" + name}.
	 */
	private static String outputPath(String inputPath) {
		java.io.File input = new java.io.File(inputPath);
		// A null parent makes File fall back to the child name alone.
		return new java.io.File(input.getParent(), "d" + input.getName())
				.getPath();
	}
}
转好的文件已经上传,地址:http://download.csdn.net/detail/pouloghost/5005734

排序使用堆排序。排序时以繁体字符为键,并对繁、简体两个文件做完全相同的交换操作,以保证排序后两者仍然逐字对应。代码:


import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Sorts the traditional/simplified correspondence table in ascending order of
 * the traditional characters.
 *
 * <p>Reads the first line of {@code trad.txt} and {@code simp.txt}, heap-sorts
 * the traditional characters by their char value while applying every swap to
 * the simplified characters as well, and writes the results to
 * {@code traditional.txt} and {@code simple.txt}. Because both sequences are
 * permuted identically, index {@code i} of one output still corresponds to
 * index {@code i} of the other.
 *
 * @author GT
 * @since 2013.1.17
 */
public class SortFile {

	/**
	 * Runs the sort on the fixed input and output file names in the current
	 * directory and prints whether the traditional sequence ended up ordered.
	 *
	 * <p>The simplified line must be at least as long as the traditional
	 * line; otherwise swapping fails with an index error.
	 *
	 * @param args
	 *            ignored
	 * @throws IOException
	 *             if a file cannot be read or written
	 */
	public static void main(String[] args) throws IOException {
		StringBuilder trad = new StringBuilder(readFirstLine("trad.txt"));
		StringBuilder simp = new StringBuilder(readFirstLine("simp.txt"));

		buildHeap(trad, simp);
		// Repeatedly move the current maximum to the end of the shrinking
		// heap, then restore the heap property on the remaining prefix.
		for (int size = trad.length() - 1; size > 0; --size) {
			exchange(trad, simp, 0, size);
			maxHeapify(trad, simp, 0, size);
		}

		System.out.println("traditional in order " + check(trad));
		writeText("traditional.txt", trad);
		writeText("simple.txt", simp);
	}

	/**
	 * Returns the first line of the named file, or an empty string when the
	 * file has no content, so an empty table does not cause a null failure.
	 */
	private static String readFirstLine(String fileName) throws IOException {
		try (BufferedReader reader = new BufferedReader(new FileReader(
				fileName))) {
			String line = reader.readLine();
			return line == null ? "" : line;
		}
	}

	/** Writes the character sequence to the named file and closes it. */
	private static void writeText(String fileName, CharSequence text)
			throws IOException {
		try (FileWriter writer = new FileWriter(fileName)) {
			writer.write(text.toString());
		}
	}

	/**
	 * Sifts the element at index {@code i} down until the subtree rooted
	 * there is a max-heap with respect to the traditional characters. Only
	 * indices below {@code size} belong to the heap.
	 */
	private static void maxHeapify(StringBuilder trad, StringBuilder simp,
			int i, int size) {
		int root = i;
		while (true) {
			int left = 2 * root + 1;
			int right = left + 1;
			int largest = root;
			if (left < size && trad.charAt(largest) < trad.charAt(left)) {
				largest = left;
			}
			if (right < size && trad.charAt(largest) < trad.charAt(right)) {
				largest = right;
			}
			if (largest == root) {
				return;
			}
			exchange(trad, simp, root, largest);
			root = largest;
		}
	}

	/**
	 * Swaps positions {@code a} and {@code b} in both sequences so the
	 * traditional/simplified pairing is preserved.
	 */
	private static void exchange(StringBuilder trad, StringBuilder simp,
			int a, int b) {
		char tradTemp = trad.charAt(a);
		char simpTemp = simp.charAt(a);
		trad.setCharAt(a, trad.charAt(b));
		simp.setCharAt(a, simp.charAt(b));
		trad.setCharAt(b, tradTemp);
		simp.setCharAt(b, simpTemp);
	}

	/** Turns the whole traditional sequence into a max-heap. */
	private static void buildHeap(StringBuilder trad, StringBuilder simp) {
		// length / 2 - 1 is the last index that has a child.
		for (int i = trad.length() / 2 - 1; i >= 0; --i) {
			maxHeapify(trad, simp, i, trad.length());
		}
	}

	/**
	 * Checks that the sequence is in non-decreasing char order. Prints the
	 * index of the first out-of-order position when it is not.
	 *
	 * @return {@code true} if the sequence is ordered
	 */
	private static boolean check(CharSequence trad) {
		for (int i = 0; i < trad.length() - 1; ++i) {
			if (trad.charAt(i) > trad.charAt(i + 1)) {
				System.out.println(i);
				return false;
			}
		}
		return true;
	}
}

第三步,使用排好序的繁简体对应表处理文件:对文件中的每个字符在繁体表中二分查找,找到则替换为对应位置的简体字符,找不到则原样输出。代码:


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Converts the traditional Chinese characters of a text file to simplified
 * ones using the sorted correspondence table produced beforehand.
 *
 * <p>The table is read from {@code traditional.txt} and {@code simple.txt} in
 * the current directory; the character at index {@code i} of the first maps
 * to the character at index {@code i} of the second. The converted text is
 * written next to the input as {@code simple<name>}. All files are read and
 * written with the default charset.
 *
 * @author GT
 * @since 2013.1.17
 */
public class Main {
	/** Traditional characters, sorted ascending by char value. */
	static String traditional = null;
	/** Simplified characters, index-aligned with {@link #traditional}. */
	static String simple = null;

	/**
	 * Loads the table and converts the file named by the single argument,
	 * line by line.
	 *
	 * @param args
	 *            exactly one input file path
	 * @throws IOException
	 *             if a file cannot be opened, read or written
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 1) {
			System.err.println("usage: java Main <file>");
			return;
		}
		initial();
		// try-with-resources closes (and thereby flushes) both streams even
		// when the conversion fails part-way through.
		try (BufferedReader br = new BufferedReader(new FileReader(args[0]));
				BufferedWriter bw = new BufferedWriter(new FileWriter(
						outputPath(args[0])))) {
			String line;
			while ((line = br.readLine()) != null) {
				simplify(line, bw);
				bw.newLine();
			}
		}
	}

	/**
	 * Builds the output path by prefixing the file name (not the whole path)
	 * with {@code "simple"}, so the output is placed in the same directory as
	 * the input. For a bare file name this is simply {@code "simple" + name}.
	 */
	private static String outputPath(String inputPath) {
		java.io.File input = new java.io.File(inputPath);
		// A null parent makes File fall back to the child name alone.
		return new java.io.File(input.getParent(), "simple" + input.getName())
				.getPath();
	}

	/**
	 * Loads the first line of each table file into {@link #traditional} and
	 * {@link #simple}. An empty file yields an empty table instead of null.
	 */
	private static void initial() throws IOException {
		traditional = readFirstLine("traditional.txt");
		simple = readFirstLine("simple.txt");
	}

	/** Returns the first line of the named file, or "" if it has none. */
	private static String readFirstLine(String fileName) throws IOException {
		try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
			String line = br.readLine();
			return line == null ? "" : line;
		}
	}

	/**
	 * Writes {@code line} to {@code bw}, replacing every character found in
	 * the traditional table with its simplified counterpart and copying all
	 * other characters unchanged. No line separator is written.
	 */
	private static void simplify(String line, BufferedWriter bw)
			throws IOException {
		for (int i = 0; i < line.length(); ++i) {
			char ch = line.charAt(i);
			int index = find(ch);
			bw.append(index != -1 ? simple.charAt(index) : ch);
		}
	}

	/**
	 * Binary search for {@code ch} in the sorted {@link #traditional} table.
	 *
	 * @param ch
	 *            character to look up
	 * @return the index of {@code ch} in the table, or -1 if it is absent
	 */
	private static int find(char ch) {
		int low = 0;
		// Inclusive upper bound: must be the last valid index, otherwise a
		// character above every table entry probes charAt(length).
		int high = traditional.length() - 1;
		while (low <= high) {
			int mid = (low + high) >>> 1;
			char candidate = traditional.charAt(mid);
			if (candidate == ch) {
				return mid;
			}
			if (candidate < ch) {
				low = mid + 1;
			} else {
				high = mid - 1;
			}
		}
		return -1;
	}
}