001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.io.dataformat.ColumnDescription; 015 import org.maltparser.core.io.dataformat.DataFormatException; 016 import org.maltparser.core.io.dataformat.DataFormatInstance; 017 import org.maltparser.core.syntaxgraph.DependencyStructure; 018 import org.maltparser.core.syntaxgraph.Element; 019 import org.maltparser.core.syntaxgraph.TokenStructure; 020 import org.maltparser.core.syntaxgraph.edge.Edge; 021 /** 022 * 023 * 024 * @author Johan Hall 025 */ 026 public class TabReader implements SyntaxGraphReader { 027 private BufferedReader reader; 028 private int sentenceCount; 029 private final StringBuilder input; 030 private DataFormatInstance dataFormatInstance; 031 private static final String IGNORE_COLUMN_SIGN = "_"; 032 private static final char TAB = '\t'; 033 private static final char NEWLINE = '\n'; 034 private static final char CARRIAGE_RETURN = '\r'; 035 private String fileName = null; 036 private URL url = null; 037 private String charsetName; 038 private int nIterations; 039 private int cIterations; 040 private boolean closeStream = true; 041 042 public TabReader() { 043 input = new StringBuilder(); 044 nIterations = 1; 045 cIterations = 1; 046 } 047 048 private void reopen() throws MaltChainedException { 049 close(); 050 if (fileName != null) { 051 open(fileName, charsetName); 052 } else if (url != null) { 053 open(url, charsetName); 054 } else { 055 throw new DataFormatException("The input stream cannot be reopen. "); 056 } 057 } 058 059 public void open(String fileName, String charsetName) throws MaltChainedException { 060 setFileName(fileName); 061 setCharsetName(charsetName); 062 try { 063 open(new FileInputStream(fileName), charsetName); 064 } catch (FileNotFoundException e) { 065 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 066 } 067 } 068 069 public void open(URL url, String charsetName) throws MaltChainedException { 070 setUrl(url); 071 setCharsetName(charsetName); 072 if (url == null) { 073 throw new DataFormatException("The input file cannot be found. "); 074 } 075 try { 076 open(url.openStream(), charsetName); 077 } catch (IOException e) { 078 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 079 } 080 } 081 082 public void open(InputStream is, String charsetName) throws MaltChainedException { 083 try { 084 if (is == System.in) { 085 closeStream = false; 086 } 087 open(new InputStreamReader(is, charsetName)); 088 } catch (UnsupportedEncodingException e) { 089 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 090 } 091 } 092 093 private void open(InputStreamReader isr) throws MaltChainedException { 094 setReader(new BufferedReader(isr)); 095 setSentenceCount(0); 096 } 097 098 public void readProlog() throws MaltChainedException { 099 100 } 101 102 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 103 if (syntaxGraph == null || dataFormatInstance == null) { 104 return false; 105 } 106 107 Element node = null; 108 Edge edge = null; 109 input.setLength(0); 110 int i = 0; 111 int terminalCounter = 0; 112 int nNewLines = 0; 113 syntaxGraph.clear(); 114 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 115 while (true) { 116 int c; 117 118 try { 119 c = reader.read(); 120 } catch (IOException e) { 121 close(); 122 throw new DataFormatException("Error when reading from the input file. ", e); 123 } 124 if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) { 125 if (input.length() != 0) { 126 if (i == 0) { 127 terminalCounter++; 128 node = syntaxGraph.addTokenNode(terminalCounter); 129 } 130 ColumnDescription column = null; 131 if (columns.hasNext()) { 132 column = columns.next(); 133 if (column.getCategory() == ColumnDescription.INPUT && node != null) { 134 syntaxGraph.addLabel(node, column.getName(), input.toString()); 135 } else if (column.getCategory() == ColumnDescription.HEAD) { 136 if (syntaxGraph instanceof DependencyStructure) { 137 if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix 138 //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) { 139 edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter); 140 } 141 } 142 else { 143 close(); 144 throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. "); 145 } 146 } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) { 147 //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody 148 syntaxGraph.addLabel(edge, column.getName(), input.toString()); 149 //} // bugfix 150 } 151 } 152 input.setLength(0); 153 nNewLines = 0; 154 i++; 155 } else if (c == TAB) { 156 throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. "); 157 } 158 if (c == NEWLINE) { 159 nNewLines++; 160 i = 0; 161 columns = dataFormatInstance.iterator(); 162 } 163 } else { 164 input.append((char)c); 165 } 166 167 if (nNewLines == 2 && c == NEWLINE) { 168 if (syntaxGraph.hasTokens()) { 169 sentenceCount++; 170 } 171 return true; 172 } else if (c == -1) { 173 if (syntaxGraph.hasTokens()) { 174 sentenceCount++; 175 } 176 if (cIterations < nIterations) { 177 cIterations++; 178 reopen(); 179 return true; 180 } 181 182 return false; 183 } 184 } 185 } 186 187 public void readEpilog() throws MaltChainedException { 188 189 } 190 191 public BufferedReader getReader() { 192 return reader; 193 } 194 195 public void setReader(BufferedReader reader) throws MaltChainedException { 196 close(); 197 this.reader = reader; 198 } 199 200 public DataFormatInstance getDataFormatInstance() { 201 return dataFormatInstance; 202 } 203 204 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 205 this.dataFormatInstance = dataFormatInstance; 206 } 207 208 public int getSentenceCount() throws MaltChainedException { 209 return sentenceCount; 210 } 211 212 public void setSentenceCount(int sentenceCount) { 213 this.sentenceCount = sentenceCount; 214 } 215 216 public String getOptions() { 217 return null; 218 } 219 220 public void setOptions(String optionString) throws MaltChainedException { 221 222 } 223 224 public String getFileName() { 225 return fileName; 226 } 227 228 public void setFileName(String fileName) { 229 this.fileName = fileName; 230 } 231 232 public URL getUrl() { 233 return url; 234 } 235 236 public void setUrl(URL url) { 237 this.url = url; 238 } 239 240 public String getCharsetName() { 241 return charsetName; 242 } 243 244 public void setCharsetName(String charsetName) { 245 this.charsetName = charsetName; 246 } 247 248 public int getNIterations() { 249 return nIterations; 250 } 251 252 public void setNIterations(int iterations) { 253 nIterations = iterations; 254 } 255 256 public int getIterationCounter() { 257 return cIterations; 258 } 259 260 public void close() throws MaltChainedException { 261 try { 262 if (reader != null) { 263 if (closeStream) { 264 reader.close(); 265 } 266 reader = null; 267 } 268 } catch (IOException e) { 269 throw new DataFormatException("Error when closing the input file. ", e); 270 } 271 } 272 273 public void clear() throws MaltChainedException { 274 close(); 275 input.setLength(0); 276 dataFormatInstance = null; 277 sentenceCount = 0; 278 } 279 }