November 15(Mon), 2010 結局ファイル書き出しでmecab&cabocha
■[Java][Cabocha][Mecab][NLP]結局ファイル書き出しでmecab&cabocha
パフォーマンス低下が著しく,TOKENの設定も難しいので,入力データも出力データも一旦ファイルシステムに書き出して受け渡すことに.
将来的にはtmpfsを使えばいいかな.
オプション指定があるので,前回とは違い,mecabおよびcabocha専用.
CabochaTest.java
import java.io.*; import org.jpn.syo.*; /** * * @author SyoTakasaki */ public class CabochaTest { /** * @param args the command line arguments */ public static void main(String[] args) throws Exception { // INIT String file_text = args[0]; String data_text = null; String data_mecab = null; String data_cabocha = null; // テキストファイルの読み込み data_text = read_file_as_text(file_text); data_text = data_text.replaceAll("。", "。\n"); data_text = data_text.replaceAll(".", ".\n"); // Mecab data_mecab = Execute.exec("mecab", data_text); // Cabocha data_cabocha = Execute.exec("cabocha", data_mecab); System.out.println(data_mecab); } private static String read_file_as_text(String file_path) throws FileNotFoundException, UnsupportedEncodingException, IOException{ StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file_path), "UTF-8")); String line; while((line = br.readLine()) != null){ sb.append(line).append("\n"); } br.close(); return sb.toString(); } }
org/jpn/syo/Execute.java
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.jpn.syo;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Runs the MeCab or CaboCha command-line tool on a piece of text, passing the
 * input and the output through temporary files instead of pipes.
 *
 * @author SyoTakasaki
 */
public class Execute {

    /** Character encoding used for the temporary files and the tool's console output. */
    private static final String ENCODING = "UTF-8";

    /**
     * Writes {@code inputText} to a temporary file, runs the requested tool on
     * it and returns the content of the output file the tool produced. Both
     * temporary files are removed before this method returns, whether it
     * succeeds or fails.
     *
     * @param cmd       {@code "mecab"} or {@code "cabocha"}; nothing else is accepted
     * @param inputText text handed to the tool
     * @return the tool's output file content, every line terminated by {@code "\n"}
     * @throws IllegalArgumentException if {@code cmd} is neither {@code "mecab"}
     *                                  nor {@code "cabocha"}
     * @throws IOException              if a temporary file cannot be created,
     *                                  written or read, if the tool cannot be
     *                                  started, or if it exits with a non-zero status
     * @throws InterruptedException     if the thread is interrupted while
     *                                  waiting for the tool to finish
     */
    public static String exec(String cmd, String inputText)
            throws IOException, InterruptedException, Exception {
        // Validate the command first, so an unsupported command leaves no
        // stray files behind. Arguments are kept as a list (never joined and
        // re-split on spaces) so file paths containing spaces stay intact.
        List<String> command = new ArrayList<String>();
        if ("mecab".equals(cmd)) {
            command.add("/usr/bin/mecab");
        } else if ("cabocha".equals(cmd)) {
            command.add("/usr/bin/cabocha");
            // NOTE(review): presumably "lattice output format" and "input is
            // already POS-tagged" - confirm against the CaboCha documentation.
            command.add("-f1");
            command.add("-I1");
        } else {
            throw new IllegalArgumentException("UnsupportedCommandException: " + cmd);
        }

        // createTempFile guarantees unique names in the default temp directory.
        File inFile = File.createTempFile(cmd + "-", ".in");
        File outFile = null;
        try {
            outFile = File.createTempFile(cmd + "-", ".out");

            // Write the input text to the temporary input file.
            write_file_as_text(inFile.getAbsolutePath(), inputText);

            command.add(inFile.getAbsolutePath());
            command.add("--output=" + outFile.getAbsolutePath());

            // Run the tool.
            ProcessBuilder builder = new ProcessBuilder(command);
            // Merge stderr into stdout so a single reader can drain both.
            builder.redirectErrorStream(true);
            Process p = builder.start();
            String consoleOutput;
            int exitCode;
            try {
                // Drain the console output before waiting: if the pipe buffer
                // filled up, the child would block and waitFor() never return.
                consoleOutput = drain(p.getInputStream());
                exitCode = p.waitFor();
            } finally {
                p.destroy();
            }
            if (exitCode != 0) {
                throw new IOException(command.get(0) + " exited with status "
                        + exitCode + ": " + consoleOutput.trim());
            }

            // Read the result back.
            return read_file_as_text(outFile.getAbsolutePath());
        } finally {
            // Delete the temporary files on every path, including failures.
            inFile.delete();
            if (outFile != null) {
                outFile.delete();
            }
        }
    }

    /** Reads {@code stream} to its end as text and closes it. */
    private static String drain(InputStream stream) throws IOException {
        StringBuilder sb = new StringBuilder();
        Reader reader = new InputStreamReader(stream, ENCODING);
        try {
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                sb.append(buffer, 0, count);
            }
        } finally {
            reader.close();
        }
        return sb.toString();
    }

    /**
     * Reads a whole file as UTF-8 text.
     *
     * @param file_path path of the file to read
     * @return the file content, with every line terminated by {@code "\n"}
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException           if reading fails
     */
    private static String read_file_as_text(String file_path)
            throws FileNotFoundException, UnsupportedEncodingException, IOException {
        StringBuilder sb = new StringBuilder();
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file_path), ENCODING));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\n");
            }
        } finally {
            br.close();
        }
        return sb.toString();
    }

    /**
     * Writes {@code text} to a file as UTF-8, replacing any existing content.
     *
     * @param file_path path of the file to write
     * @param text      content to write
     * @throws IOException if the file cannot be opened or written
     */
    private static void write_file_as_text(String file_path, String text)
            throws UnsupportedEncodingException, IOException {
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file_path), ENCODING));
        try {
            bw.write(text);
            bw.flush();
        } finally {
            bw.close();
        }
    }
}
トラックバック - http://d.hatena.ne.jp/Syo-Takasaki/20101115/1289823589
リンク元
- 12 http://www.google.co.jp/search?hl=ja&lr=lang_ja&tbs=lr:lang_1ja&q=ubuntu+sun+JVM&aq=f&aqi=&aql=&oq=&gs_rfai=
- 11 http://www.google.com/cse?cx=partner-pub-9300639326172081:d9bbzbtli15&ie=UTF-8&sa=Search&q=ubuntsu+jdk+インストール&hl=ja
- 10 http://www.google.co.jp/search?sourceid=navclient&hl=ja&ie=UTF-8&rlz=1T4SUNC_jaJP354JP355&q=ubuntu+jdk
- 10 http://www.google.com/custom?hl=ja&client=pub-9300639326172081&cof=FORID:13;AH:left;CX:Ubuntu%2010%2E10;L:http://www.google.com/intl/ja/images/logos/custom_search_logo_sm.gif;LH:30;LP:1;LC:#0000ff;VLC:#663399;DIV:#336699;&adkw=
- 9 http://www.google.com/search?client=ubuntu&channel=fs&q=ubuntu+java+sdk+&ie=utf-8&oe=utf-8
- 7 http://www.google.co.jp/search?sourceid=chrome&ie=UTF-8&q=netbeans+6.9+文字化け+ubuntu
- 6 http://www.google.co.jp/search?hl=ja&lr=lang_ja&client=firefox-a&rls=com.ubuntu:ja:unofficial&tbs=lr:lang_1ja&q=ubuntu+Can't+locate+loadable+object+for+module&aq=f&aqi=&aql=&oq=&gs_rfai=
- 6 http://www.google.co.jp/search?q=VMware+イメージコピー&ie=utf-8&oe=utf-8&aq=t&hl=ja
- 6 http://www.google.co.jp/search?sourceid=chrome&client=ubuntu&channel=cs&ie=UTF-8&q=ubuntu+jdk
- 6 http://www.google.com/search?client=ubuntu&channel=fs&q=sun-java6-jdk」&ie=utf-8&oe=utf-8