2012-04-16 18:36:53.0|分类: java|浏览量: 1580
这里需要注意一下:如果 txt 文件的编码不是 UTF-8,读取出来的内容会是乱码,所以需要先把 txt 文件的编码设置为 UTF-8。
package com.java.hanzi.utf; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.regex.Matcher; import java.util.regex.Pattern; public class TxtCount { /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub File file = new File("D:\\2012-0414.txt"); try { //FileInputStream fin = new FileInputStream(file); //FileReader默认使用的是GBK,查看123.txt文件的编码格式 // FileInputStreamReader(new InputStreamReader(new FileInputStream("path")),"UTF-8") // int count = 0; // FileReader fr = new FileReader(file); //System.out.println("fr.getEncoding()="+fr.getEncoding()); // BufferedReader bf = new BufferedReader(fr); BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8")); // System.out.println("fr.getEncoding()="+fr.getEncoding()); String str = null; while((str=br.readLine())!=null){ count = count + calculator(str); } System.out.println("-----"+count); br.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static int calculator(String str){ int count = 0; String regEx = "[\\u4e00-\\u9fa5]"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(str); while (m.find()) { for (int i = 0; i <= m.groupCount(); i++) { count = count + 1; } } System.out.println(str); System.out.println("共有 " + count + "个 "); return count; } } |