package org.maltparser.core.syntaxgraph.writer;

import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
import org.maltparser.ml.libsvm.LibsvmException;

/**
 * Writes phrase structures in a Negra-style text format.
 *
 * <p>Each sentence is written as a <code>#BOS id</code> line, one line per terminal,
 * one <code>#id</code> line per non-terminal and a closing <code>#EOS id</code> line.
 * Non-terminals are renumbered per sentence, starting at a configurable id
 * (default 500, option <code>-s</code>) and ordered by increasing node height.
 *
 * @author Johan Hall
 */
public class NegraWriter implements SyntaxGraphWriter {
    private BufferedWriter writer;
    private DataFormatInstance dataFormatInstance;
    private String optionString;
    private int sentenceCount;
    /** Maps the graph index of a non-terminal to the id written to the output (insertion ordered). */
    private final LinkedHashMap<Integer, Integer> nonTerminalIndexMap;
    /** First id handed out to non-terminals; configurable with the <code>-s</code> option. */
    private int startIdOfNonTerminals = 500;
    /** Whether {@link #close()} closes the underlying stream (false for System.out/System.err). */
    private boolean closeStream = true;

    public NegraWriter() {
        nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>();
    }

    /**
     * Opens the given file for writing with the given character encoding.
     *
     * @param fileName the path of the output file
     * @param charsetName the name of the character encoding
     * @throws MaltChainedException if the file cannot be opened or the encoding is unsupported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(fileName);
            // A file stream is always owned by this writer, even if an earlier
            // open() targeted System.out/System.err.
            closeStream = true;
            open(new OutputStreamWriter(fos, charsetName));
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
        } catch (UnsupportedEncodingException e) {
            // Do not leak the file handle that was opened before the encoding was rejected.
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException ignored) {
                    // The unsupported encoding is the error worth reporting.
                }
            }
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        }
    }

    /**
     * Starts writing to the given stream with the given character encoding.
     * System.out and System.err are flushed but never closed by {@link #close()}.
     *
     * @param os the stream to write to
     * @param charsetName the name of the character encoding
     * @throws MaltChainedException if the encoding is unsupported
     */
    public void open(OutputStream os, String charsetName) throws MaltChainedException {
        try {
            // Recomputed on every open so that a writer reused after System.out
            // closes a regular stream again.
            closeStream = !(os == System.out || os == System.err);
            open(new OutputStreamWriter(os, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        }
    }

    /** Wraps the stream writer in a buffer and resets the sentence counter. */
    private void open(OutputStreamWriter osw) throws MaltChainedException {
        setWriter(new BufferedWriter(osw));
        setSentenceCount(0);
    }

    public void writeProlog() throws MaltChainedException { }

    /**
     * Writes one sentence. Nothing is written (and the sentence counter is not advanced)
     * when the graph is null, is not a {@link PhraseStructure}, has no tokens, or when no
     * data format instance has been set.
     *
     * @param syntaxGraph the sentence to write
     * @throws MaltChainedException if writing fails
     */
    public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) {
            return;
        }
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        sentenceCount++;
        // Fall back to the running count when the sentence carries no id of its own.
        final int sentenceId = phraseStructure.getSentenceID();
        final String idText = Integer.toString(sentenceId != 0 ? sentenceId : sentenceCount);
        try {
            writer.write("#BOS ");
            writer.write(idText);
            writer.write('\n');

            if (phraseStructure.hasNonTerminals()) {
                // The terminal lines refer to their parents by output id,
                // so the ids must be assigned before anything is written.
                calculateIndices(phraseStructure);
                writeTerminals(phraseStructure);
                writeNonTerminals(phraseStructure);
            } else {
                writeTerminals(phraseStructure);
            }
            writer.write("#EOS ");
            writer.write(idText);
            writer.write('\n');
        } catch (IOException e) {
            throw new DataFormatException("Could not write to the output file. ", e);
        }
    }

    public void writeEpilog() throws MaltChainedException { }

    /**
     * Assigns an output id to every non-terminal of the sentence. Ids are handed out
     * consecutively from the configured start id, level by level in order of increasing
     * height (starting at height 1); the scan stops at the first height that has no node.
     */
    private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException {
        final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
        for (int index : phraseStructure.getNonTerminalIndices()) {
            heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
        }

        nonTerminalIndexMap.clear();
        int nextId = startIdOfNonTerminals;
        boolean foundAtLevel = true;
        for (int level = 1; foundAtLevel; level++) {
            foundAtLevel = false;
            for (int index : phraseStructure.getNonTerminalIndices()) {
                if (heights.get(index) == level) {
                    final NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
                    nonTerminalIndexMap.put(nt.getIndex(), nextId++);
                    foundAtLevel = true;
                }
            }
        }
    }

    /**
     * Number of tab characters written after an input column of a terminal line.
     * The first, second and fourth input columns are padded according to the length
     * of the symbol (in units of 8 characters); at least one tab is always written.
     */
    private static int tabsAfterInputColumn(int inputColumnNo, int symbolLength) {
        int nTabs = 1;
        if (inputColumnNo == 1 || inputColumnNo == 2) {
            nTabs = 3 - (symbolLength / 8);
        } else if (inputColumnNo == 4) {
            nTabs = 2 - (symbolLength / 8);
        }
        return Math.max(nTabs, 1);
    }

    /**
     * Writes one line per token: the input columns, the label of the edge to the parent
     * (or <code>--</code>), the output id of the parent (or <code>0</code> when the token
     * hangs directly under the root) and finally any labeled incoming secondary edges.
     */
    private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            for (int index : phraseStructure.getTokenIndices()) {
                final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
                final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
                ColumnDescription column = null;
                int inputColumnNo = 1;
                while (columns.hasNext()) {
                    column = columns.next();
                    if (column.getCategory() == ColumnDescription.INPUT) {
                        final String symbol = terminal.getLabelSymbol(column.getSymbolTable());
                        writer.write(symbol);
                        final int nTabs = tabsAfterInputColumn(inputColumnNo, symbol.length());
                        for (int j = 0; j < nTabs; j++) {
                            writer.write('\t');
                        }
                        inputColumnNo++;
                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
                        if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) {
                            writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable()));
                            writer.write('\t');
                        } else {
                            writer.write("--\t");
                        }
                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) {
                        if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) {
                            writer.write('0');
                        } else {
                            writer.write(Integer.toString(nonTerminalIndexMap.get(terminal.getParent().getIndex())));
                        }
                    }
                }
                // 'column' is now the LAST column of the data format; its symbol table is
                // used for the secondary edge labels.
                // NOTE(review): presumably the secondary edge label column is always last - confirm.
                if (column != null) {
                    for (Edge edge : terminal.getIncomingSecondaryEdges()) {
                        if (edge.hasLabel(column.getSymbolTable())) {
                            writer.write('\t');
                            writer.write(edge.getLabelSymbol(column.getSymbolTable()));
                            writer.write('\t');
                            if (edge.getSource() instanceof NonTerminalNode) {
                                writer.write(Integer.toString(nonTerminalIndexMap.get(edge.getSource().getIndex())));
                            } else {
                                writer.write(Integer.toString(edge.getSource().getIndex()));
                            }
                        }
                    }
                }
                writer.write("\n");
            }
        } catch (IOException e) {
            throw new DataFormatException("The Negra writer is not able to write. ", e);
        }
    }

    /**
     * Writes one <code>#id</code> line per non-terminal in the order the ids were assigned:
     * category (column "CAT"), parent edge label (column "LABEL"), the output id of the
     * parent (<code>0</code> directly under the root) and any labeled incoming secondary
     * edges (column "SECEDGELABEL"). Writing stops at the first root or missing node.
     */
    private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            for (int index : nonTerminalIndexMap.keySet()) {
                final NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);

                // Ids are assigned by increasing height, so once the root shows up
                // there is nothing left to write.
                if (nonTerminal == null || nonTerminal.isRoot()) {
                    return;
                }
                writer.write('#');
                writer.write(Integer.toString(nonTerminalIndexMap.get(index)));
                writer.write("\t\t\t--\t\t\t");
                if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) {
                    writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable()));
                } else {
                    writer.write("--");
                }
                writer.write("\t--\t\t");
                if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) {
                    writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable()));
                } else {
                    writer.write("--");
                }
                writer.write('\t');
                if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) {
                    writer.write('0');
                } else {
                    writer.write(Integer.toString(nonTerminalIndexMap.get(nonTerminal.getParent().getIndex())));
                }
                for (Edge edge : nonTerminal.getIncomingSecondaryEdges()) {
                    if (edge.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) {
                        writer.write('\t');
                        writer.write(edge.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable()));
                        writer.write('\t');
                        if (edge.getSource() instanceof NonTerminalNode) {
                            writer.write(Integer.toString(nonTerminalIndexMap.get(edge.getSource().getIndex())));
                        } else {
                            writer.write(Integer.toString(edge.getSource().getIndex()));
                        }
                    }
                }
                writer.write("\n");
            }
        } catch (IOException e) {
            throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e);
        }
    }

    public BufferedWriter getWriter() {
        return writer;
    }

    public void setWriter(BufferedWriter writer) {
        this.writer = writer;
    }

    public int getSentenceCount() {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
        this.dataFormatInstance = dataFormatInstance;
    }

    public String getOptions() {
        return optionString;
    }

    /**
     * Parses the writer options: a sequence of <code>-flag value</code> pairs separated by
     * blanks or underscores. The only supported flag is <code>-s</code>, the first id used
     * for non-terminals. A null or empty option string leaves the settings unchanged, and
     * empty tokens caused by consecutive separators are skipped.
     *
     * @param optionString the option string, may be null
     * @throws MaltChainedException if a flag is malformed, unknown, lacks a value or has
     *         an invalid value
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null) {
            return;
        }
        final String[] argv;
        try {
            argv = optionString.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the Negra writer option '"+optionString+"'. ", e);
        }
        int i = 0;
        while (i < argv.length) {
            final String flag = argv[i++];
            if (flag.length() == 0) {
                continue; // produced by leading or consecutive separators
            }
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            if (flag.length() < 2) {
                throw new DataFormatException("The argument flag '-' does not name an option. ");
            }
            while (i < argv.length && argv[i].length() == 0) {
                i++;
            }
            if (i >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i++];
            switch (flag.charAt(1)) {
            case 's':
                try {
                    startIdOfNonTerminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new MaltChainedException("The Negra writer option -s must be an integer value. ");
                }
                break;
            default:
                // The exception type is kept for backward compatibility with existing callers.
                throw new LibsvmException("Unknown Negra writer parameter: '"+flag+"' with value '"+value+"'. ");
            }
        }
    }

    /**
     * Flushes the writer and, unless it wraps System.out or System.err, closes it.
     * The writer reference is dropped even if flushing or closing fails.
     *
     * @throws MaltChainedException if flushing or closing fails
     */
    public void close() throws MaltChainedException {
        if (writer == null) {
            return;
        }
        try {
            if (closeStream) {
                writer.close(); // close() flushes the buffer before closing
            } else {
                writer.flush();
            }
        } catch (IOException e) {
            throw new DataFormatException("Could not close the output file. ", e);
        } finally {
            writer = null;
        }
    }
}