package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;

/**
 * Reads phrase structure sentences from a line-oriented input in which each
 * sentence is delimited by <code>#BOS</code> and <code>#EOS</code> lines.
 * Inside a sentence, a line starting with <code>#</code> followed by a digit
 * describes a non-terminal node and every other line describes a terminal
 * (token). Fields are tab-separated and are interpreted according to the
 * columns of the configured {@link DataFormatInstance}.
 *
 * <p>The input can be read several times in a row (see
 * {@link #setNIterations(int)}) provided it was opened from a file name or a
 * URL, since only those sources can be reopened.</p>
 *
 * @author Johan Hall
 */
public class NegraReader implements SyntaxGraphReader {
    /** Header tables of the input; only SENTENCE and UNDEF are currently used while reading. */
    private enum NegraTables {
        ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
    }

    private BufferedReader reader;
    private DataFormatInstance dataFormatInstance;
    private int sentenceCount;
    private String optionString;
    private int formatVersion;
    private NegraTables currentHeaderTable;
    private int currentTerminalSize;
    private int currentNonTerminalSize;
    /** Non-terminal nodes of the current sentence, keyed by the id used in the input. */
    private SortedMap<Integer,PhraseStructureNode> nonterminals;
    /** Most recently read edge label; consumed when the matching parent field is read. */
    private StringBuilder edgelabelSymbol;
    /** Name of the column the most recently read edge label came from. */
    private StringBuilder edgelabelTableName;
    /** First id that denotes a non-terminal in the input; configurable with the option -s. */
    private int startIdOfNonterminals = 500;
    private String fileName = null;
    private URL url = null;
    private String charsetName;
    private int nIterations;
    private int cIterations;
    /** False when the underlying stream is System.in, which must not be closed by this reader. */
    private boolean closeStream = true;

    public NegraReader() {
        currentHeaderTable = NegraTables.UNDEF;
        edgelabelSymbol = new StringBuilder();
        edgelabelTableName = new StringBuilder();
        nonterminals = new TreeMap<Integer,PhraseStructureNode>();
        nIterations = 1;
        cIterations = 1;
    }

    /**
     * Closes the current input and opens it again from the remembered file
     * name or URL.
     *
     * @throws MaltChainedException if the input was opened from a plain stream and cannot be reopened
     */
    private void reopen() throws MaltChainedException {
        close();
        if (fileName != null) {
            open(fileName, charsetName);
        } else if (url != null) {
            open(url, charsetName);
        } else {
            throw new DataFormatException("The input stream cannot be reopen. ");
        }
    }

    /**
     * Opens the file with the given name for reading.
     *
     * @param fileName    path of the input file
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found or the encoding is not supported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        setFileName(fileName);
        setCharsetName(charsetName);
        final InputStream is;
        try {
            is = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
        }
        openOwnedStream(is, charsetName);
    }

    /**
     * Opens the resource behind the given URL for reading.
     *
     * @param url         location of the input
     * @param charsetName name of the character encoding of the input
     * @throws MaltChainedException if the URL cannot be opened or the encoding is not supported
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        setUrl(url);
        setCharsetName(charsetName);
        final InputStream is;
        try {
            is = url.openStream();
        } catch (IOException e) {
            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
        }
        openOwnedStream(is, charsetName);
    }

    /**
     * Starts reading from the given stream. The stream is closed by
     * {@link #close()} unless it is <code>System.in</code>.
     *
     * @param is          the stream to read from
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is not supported
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        try {
            // Recomputed on every open so that a reader which was once attached
            // to System.in still closes a file opened afterwards.
            closeStream = (is != System.in);
            open(new InputStreamReader(is, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
        }
    }

    private void open(InputStreamReader isr) throws MaltChainedException {
        setReader(new BufferedReader(isr));
        setSentenceCount(0);
    }

    /**
     * Opens a stream that this reader created itself and makes sure it is
     * released again if opening fails (e.g. because of an unsupported charset).
     */
    private void openOwnedStream(InputStream is, String charsetName) throws MaltChainedException {
        boolean opened = false;
        try {
            open(is, charsetName);
            opened = true;
        } finally {
            if (!opened) {
                closeQuietly(is);
            }
        }
    }

    /** Closes the stream and ignores any failure; used only on error paths. */
    private static void closeQuietly(InputStream is) {
        try {
            is.close();
        } catch (IOException ignored) {
            // The failure that led here is the one worth reporting.
        }
    }

    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads the next sentence into the given graph, which is cleared first.
     *
     * @param syntaxGraph the graph to fill; must be a {@link PhraseStructure}
     * @return <code>true</code> if an <code>#EOS</code> line was reached, or if
     *         the end of the input was reached and the input was reopened for
     *         another iteration; <code>false</code> if the graph is
     *         <code>null</code> or not a phrase structure, or if the input is
     *         exhausted
     * @throws MaltChainedException if the input cannot be read or contains a malformed number
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (!(syntaxGraph instanceof PhraseStructure)) { // also covers null
            return false;
        }
        syntaxGraph.clear();
        nonterminals.clear();
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        // Node described by the most recently read line.
        PhraseStructureNode child = null;
        currentHeaderTable = NegraTables.UNDEF;
        String line = null;
        try {
            while (true) {
                line = reader.readLine();
                if (line == null) {
                    if (syntaxGraph.hasTokens()) {
                        // Tokens are only created after a #BOS line, which has
                        // already counted this sentence, so it is not counted again.
                        updateDependencyMapping(syntaxGraph);
                    }
                    if (cIterations < nIterations) {
                        cIterations++;
                        reopen();
                        return true;
                    }
                    return false;
                } else if (line.startsWith("#EOS")) {
                    currentTerminalSize = 0;
                    currentNonTerminalSize = 0;
                    currentHeaderTable = NegraTables.UNDEF;
                    updateDependencyMapping(syntaxGraph);
                    return true;
                } else if (line.startsWith("#BOS")) {
                    currentHeaderTable = NegraTables.SENTENCE;
                    final int sentenceId = parseSentenceId(line);
                    if (sentenceId >= 0) {
                        phraseStructure.setSentenceID(sentenceId);
                    }
                    sentenceCount++;
                } else if (currentHeaderTable == NegraTables.SENTENCE) {
                    if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) {
                        child = readNonTerminalLine(line, syntaxGraph, phraseStructure, child);
                    } else {
                        child = readTerminalLine(line, syntaxGraph, phraseStructure);
                    }
                } else if (line.startsWith("#EOT")) {
                    currentHeaderTable = NegraTables.UNDEF;
                }
                // Every other line outside a sentence (%% comments, #FORMAT,
                // #BOT and the table contents) is deliberately ignored.
            }
        } catch (IOException e) {
            throw new DataFormatException("Error when reading from the input file. ", e);
        }
    }

    /** Propagates the finished phrase structure to the dependency mapping, if the graph has one. */
    private void updateDependencyMapping(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
            final MappablePhraseStructureGraph mappable = (MappablePhraseStructureGraph)syntaxGraph;
            mappable.getMapping().updateDependenyGraph(mappable, ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
        }
    }

    /**
     * Extracts the sentence id from a <code>#BOS</code> line: the text from the
     * first digit up to the next space or tab, or up to the end of the line.
     *
     * @return the id, or -1 if the line carries no id
     */
    private static int parseSentenceId(String line) throws MaltChainedException {
        final int n = line.length();
        int s = -1;
        int e = n;
        for (int i = 5; i < n; i++) { // 5 == length of "#BOS" plus the separator
            final char c = line.charAt(i);
            if (c == ' ' || c == '\t') {
                e = i;
                break;
            }
            if (s == -1 && Character.isDigit(c)) {
                s = i;
            }
        }
        if (s == -1) {
            return -1;
        }
        return parseNumber(line.substring(s, e), line);
    }

    /** Parses an integer field and reports malformed input as a data format error. */
    private static int parseNumber(String value, String line) throws MaltChainedException {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new DataFormatException("The value '"+value+"' in the input line '"+line+"' is not an integer. ", e);
        }
    }

    /**
     * Reads one non-terminal line of the current sentence.
     *
     * @param child the node of the previously read line; it stays current
     *              until the field at column position 0 has been read
     * @return the non-terminal node described by the line
     */
    private PhraseStructureNode readNonTerminalLine(String line, TokenStructure syntaxGraph,
            PhraseStructure phraseStructure, PhraseStructureNode child) throws MaltChainedException {
        final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
        ColumnDescription column = null;
        currentNonTerminalSize++;
        int start = 0;
        // Secondary edge fields alternate between a label and the id of the other node.
        int secedgecounter = 0;
        for (int i = 0, n = line.length(); i < n; i++) {
            final boolean isTab = line.charAt(i) == '\t';
            if (isTab && start == i) {
                start++; // empty field: skip it without consuming a column
            } else if (isTab || i == n - 1) {
                // When the line has more fields than columns the last column is
                // reused, which is what repeated secondary edge pairs rely on.
                if (columns.hasNext()) {
                    column = columns.next();
                }
                final String value = (i == n - 1) ? line.substring(start) : line.substring(start, i);
                if (column.getPosition() == 0) {
                    // The id field has the form "#<id>".
                    final int index = parseNumber(value.substring(1), line);
                    child = nonterminals.get(index);
                    if (child == null) {
                        if (index != 0) {
                            child = phraseStructure.addNonTerminalNode(index - startIdOfNonterminals + 1);
                        }
                        nonterminals.put(index, child);
                    }
                } else if (column.getPosition() == 2 && child != null) {
                    syntaxGraph.addLabel(child, "CAT", value);
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
                    rememberEdgeLabel(column, value);
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
                    attachToParent(syntaxGraph, phraseStructure, child, parseNumber(value, line));
                } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
                    if (secedgecounter % 2 == 0) {
                        edgelabelSymbol.setLength(0);
                        edgelabelSymbol.append(value);
                    } else {
                        linkSecondaryEdge(phraseStructure, column, child, parseNumber(value, line));
                    }
                    secedgecounter++;
                }
                start = i + 1;
            }
        }
        return child;
    }

    /**
     * Reads one terminal line of the current sentence and adds the token to the graph.
     *
     * @return the token node created for the line
     */
    private PhraseStructureNode readTerminalLine(String line, TokenStructure syntaxGraph,
            PhraseStructure phraseStructure) throws MaltChainedException {
        final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
        ColumnDescription column = null;
        currentTerminalSize++;
        final PhraseStructureNode child = syntaxGraph.addTokenNode(currentTerminalSize);
        int start = 0;
        // Secondary edge fields alternate between a label and the id of the other node.
        int secedgecounter = 0;
        for (int i = 0, n = line.length(); i < n; i++) {
            final boolean isTab = line.charAt(i) == '\t';
            if (isTab && start == i) {
                start++; // empty field: skip it without consuming a column
            } else if (isTab || i == n - 1) {
                if (columns.hasNext()) {
                    column = columns.next();
                }
                final String value = (i == n - 1) ? line.substring(start) : line.substring(start, i);
                if (column.getCategory() == ColumnDescription.INPUT && child != null) {
                    syntaxGraph.addLabel(child, column.getName(), value);
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) {
                    rememberEdgeLabel(column, value);
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
                    attachToParent(syntaxGraph, phraseStructure, child, parseNumber(value, line));
                } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
                    if (secedgecounter % 2 == 0) {
                        edgelabelSymbol.setLength(0);
                        edgelabelSymbol.append(value);
                    } else {
                        linkSecondaryEdge(phraseStructure, column, child, parseNumber(value, line));
                    }
                    secedgecounter++;
                }
                start = i + 1;
            }
        }
        return child;
    }

    /** Stores an edge label and its column name until the parent field of the same line is read. */
    private void rememberEdgeLabel(ColumnDescription column, String value) {
        edgelabelSymbol.setLength(0);
        edgelabelSymbol.append(value);
        edgelabelTableName.setLength(0);
        edgelabelTableName.append(column.getName());
    }

    /**
     * Adds a phrase structure edge from the parent with the given id to the
     * child and labels it with the remembered edge label. Id 0 denotes the
     * root; an unknown id creates the non-terminal on the fly.
     */
    private void attachToParent(TokenStructure syntaxGraph, PhraseStructure phraseStructure,
            PhraseStructureNode child, int index) throws MaltChainedException {
        PhraseStructureNode parent = nonterminals.get(index);
        if (parent == null) {
            if (index == 0) {
                parent = phraseStructure.getPhraseStructureRoot();
            } else {
                parent = phraseStructure.addNonTerminalNode(index - startIdOfNonterminals + 1);
            }
            nonterminals.put(index, parent);
        }
        final Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
        syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
    }

    /**
     * Adds a secondary edge from the node with the given id to the child and
     * labels it with the remembered edge label. Id 0 denotes the root, ids
     * below the non-terminal start id denote tokens, and all other ids denote
     * non-terminals (created on the fly when unknown).
     */
    private void linkSecondaryEdge(PhraseStructure phraseStructure, ColumnDescription column,
            PhraseStructureNode child, int index) throws MaltChainedException {
        PhraseStructureNode source;
        if (index == 0) {
            source = phraseStructure.getPhraseStructureRoot();
        } else if (index < startIdOfNonterminals) {
            source = phraseStructure.getTokenNode(index);
        } else {
            source = nonterminals.get(index);
            if (source == null) {
                source = phraseStructure.addNonTerminalNode(index - startIdOfNonterminals + 1);
                nonterminals.put(index, source);
            }
        }
        final Edge e = phraseStructure.addSecondaryEdge(source, child);
        e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
    }

    public void readEpilog() throws MaltChainedException {

    }

    public BufferedReader getReader() {
        return reader;
    }

    public void setReader(BufferedReader reader) {
        this.reader = reader;
    }

    public int getSentenceCount() {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public int getFormatVersion() {
        return formatVersion;
    }

    public void setFormatVersion(int formatVersion) {
        this.formatVersion = formatVersion;
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
        this.dataFormatInstance = inputDataFormatInstance;
    }

    public String getOptions() {
        return optionString;
    }

    /**
     * Parses the reader options. Options are flag/value pairs separated by
     * blanks or underscores; the only supported flag is <code>-s</code>, the
     * first id that denotes a non-terminal. A <code>null</code> or empty
     * option string leaves the defaults untouched.
     *
     * @param optionString the options, e.g. <code>"-s 500"</code>
     * @throws MaltChainedException if a flag is malformed, unknown, lacks a value or has a non-integer value
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null) {
            return;
        }

        final String[] argv;
        try {
            argv = optionString.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the NegraReader option '"+optionString+"'. ", e);
        }
        int i = skipEmptyTokens(argv, 0);
        while (i < argv.length) {
            final String flag = argv[i++];
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            i = skipEmptyTokens(argv, i);
            if (i >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i++];
            if (flag.length() < 2) { // a bare "-" names no option
                throw new DataFormatException("Unknown NegraReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
            switch (flag.charAt(1)) {
            case 's':
                try {
                    startIdOfNonterminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new MaltChainedException("The NegraReader option -s must be an integer value. ");
                }
                break;
            default:
                throw new DataFormatException("Unknown NegraReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
            i = skipEmptyTokens(argv, i);
        }
    }

    /** Returns the index of the first non-empty token at or after <code>from</code>. */
    private static int skipEmptyTokens(String[] tokens, int from) {
        int i = from;
        while (i < tokens.length && tokens[i].length() == 0) {
            i++;
        }
        return i;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public URL getUrl() {
        return url;
    }

    public void setUrl(URL url) {
        this.url = url;
    }

    public String getCharsetName() {
        return charsetName;
    }

    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    public int getNIterations() {
        return nIterations;
    }

    public void setNIterations(int iterations) {
        nIterations = iterations;
    }

    public int getIterationCounter() {
        return cIterations;
    }

    /**
     * Releases the reader. The underlying stream is closed unless it is
     * <code>System.in</code>. Calling this method on a reader that is not
     * open has no effect.
     *
     * @throws MaltChainedException if closing the input fails
     */
    public void close() throws MaltChainedException {
        if (reader == null) {
            return;
        }
        try {
            if (closeStream) {
                reader.close();
            }
        } catch (IOException e) {
            throw new DataFormatException("Error when closing the input file.", e);
        } finally {
            // Drop the reference even if closing failed so the reader is not reused.
            reader = null;
        }
    }
}