001 package org.maltparser.parser.guide.instance; 002 003 import java.io.BufferedReader; 004 import java.io.BufferedWriter; 005 import java.io.IOException; 006 import java.util.SortedMap; 007 008 import java.util.ArrayList; 009 import java.util.TreeMap; 010 import java.util.TreeSet; 011 import java.util.regex.Pattern; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.feature.FeatureException; 015 import org.maltparser.core.feature.FeatureVector; 016 import org.maltparser.core.feature.function.FeatureFunction; 017 import org.maltparser.core.feature.function.Modifiable; 018 import org.maltparser.core.feature.value.SingleFeatureValue; 019 import org.maltparser.core.syntaxgraph.DependencyStructure; 020 import org.maltparser.parser.guide.ClassifierGuide; 021 import org.maltparser.parser.guide.GuideException; 022 import org.maltparser.parser.guide.Model; 023 import org.maltparser.parser.history.action.SingleDecision; 024 025 /** 026 The feature divide model is used for divide the training instances into several models according to 027 a divide feature. Usually this strategy decrease the training and classification time, but can also decrease 028 the accuracy of the parser. 029 030 @author Johan Hall 031 @since 1.0 032 */ 033 public class FeatureDivideModel implements InstanceModel { 034 private Model parent; 035 private final SortedMap<Integer,AtomicModel> divideModels; 036 private FeatureVector masterFeatureVector; 037 private FeatureVector divideFeatureVector; 038 private int frequency = 0; 039 private FeatureFunction divideFeature; 040 private int divideThreshold; 041 private AtomicModel masterModel; 042 private ArrayList<Integer> divideFeatureIndexVector; 043 044 /** 045 * Constructs a feature divide model. 046 * 047 * @param features the feature vector used by the atomic model. 048 * @param parent the parent guide model. 049 * @throws MaltChainedException 050 */ 051 public FeatureDivideModel(FeatureVector features, Model parent) throws MaltChainedException { 052 setParent(parent); 053 setFrequency(0); 054 initSplitParam(features); 055 divideModels = new TreeMap<Integer,AtomicModel>(); 056 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) { 057 masterModel = new AtomicModel(-1, masterFeatureVector, this); 058 } else if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) { 059 load(); 060 } 061 } 062 063 public void addInstance(SingleDecision decision) throws MaltChainedException { 064 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) { 065 throw new GuideException("Can only add instance during learning. "); 066 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) { 067 throw new GuideException("The divide feature does not have a single value. "); 068 } 069 070 divideFeature.update(); 071 if (divideModels != null) { 072 if (!divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) { 073 divideModels.put(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), new AtomicModel(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), divideFeatureVector, this)); 074 } 075 divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).addInstance(decision); 076 } else { 077 throw new GuideException("The feature divide models cannot be found. "); 078 } 079 } 080 081 public void noMoreInstances() throws MaltChainedException { 082 // if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) { 083 // throw new GuideException("Can only finish all data during learning. "); 084 // } 085 086 if (divideModels != null) { 087 divideFeature.updateCardinality(); 088 for (Integer index : divideModels.keySet()) { 089 divideModels.get(index).noMoreInstances(); 090 } 091 final TreeSet<Integer> removeSet = new TreeSet<Integer>(); 092 for (Integer index : divideModels.keySet()) { 093 if (divideModels.get(index).getFrequency() <= divideThreshold) { 094 divideModels.get(index).moveAllInstances(masterModel, divideFeature, divideFeatureIndexVector); 095 removeSet.add(index); 096 } 097 } 098 for (Integer index : removeSet) { 099 divideModels.remove(index); 100 } 101 masterModel.noMoreInstances(); 102 103 } else { 104 throw new GuideException("The feature divide models cannot be found. "); 105 } 106 } 107 108 public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException { 109 // if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) { 110 // throw new GuideException("Can only finish sentence during learning. "); 111 // } 112 113 if (divideModels != null) { 114 for (AtomicModel divideModel : divideModels.values()) { 115 divideModel.finalizeSentence(dependencyGraph); 116 } 117 } else { 118 throw new GuideException("The feature divide models cannot be found. "); 119 } 120 } 121 122 public boolean predict(SingleDecision decision) throws MaltChainedException { 123 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) { 124 throw new GuideException("Can only predict during parsing. "); 125 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) { 126 throw new GuideException("The divide feature does not have a single value. "); 127 } 128 129 //divideFeature.update(); 130 if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) { 131 return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).predict(decision); 132 } else if (masterModel != null && masterModel.getFrequency() > 0) { 133 return masterModel.predict(decision); 134 } else { 135 getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " + 136 "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" + 137 " class code '1' is used. "); 138 139 decision.addDecision(1); // default prediction 140 //classCodeTable.getEmptyKBestList().addKBestItem(1); 141 } 142 return true; 143 } 144 145 public FeatureVector predictExtract(SingleDecision decision) throws MaltChainedException { 146 return getAtomicModel().predictExtract(decision); 147 } 148 149 public FeatureVector extract() throws MaltChainedException { 150 return getAtomicModel().extract(); 151 } 152 153 private AtomicModel getAtomicModel() throws MaltChainedException { 154 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) { 155 throw new GuideException("Can only predict during parsing. "); 156 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) { 157 throw new GuideException("The divide feature does not have a single value. "); 158 } 159 160 if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) { 161 return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()); 162 } else if (masterModel != null && masterModel.getFrequency() > 0) { 163 return masterModel; 164 } else { 165 getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " + 166 "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" + 167 " class code '1' is used. "); 168 } 169 return null; 170 } 171 172 public void terminate() throws MaltChainedException { 173 if (divideModels != null) { 174 for (AtomicModel divideModel : divideModels.values()) { 175 divideModel.terminate(); 176 } 177 } 178 if (masterModel != null) { 179 masterModel.terminate(); 180 } 181 } 182 183 public void train() throws MaltChainedException { 184 for (AtomicModel divideModel : divideModels.values()) { 185 divideModel.train(); 186 } 187 masterModel.train(); 188 save(); 189 for (AtomicModel divideModel : divideModels.values()) { 190 divideModel.terminate(); 191 } 192 masterModel.terminate(); 193 } 194 195 /** 196 * Initialize the feature split parameters and the split feature vector and master feature vector 197 * according to the behavior strategy. 198 * 199 * @param featureVector the parent guide model's feature vector. 200 * @throws MaltChainedException 201 */ 202 protected void initSplitParam(FeatureVector featureVector) throws MaltChainedException { 203 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_column") == null 204 || getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().length() == 0) { 205 throw new GuideException("The option '--guide-data_split_column' cannot be found, when initializing the data split. "); 206 } 207 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_structure") == null 208 || getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().length() == 0) { 209 throw new GuideException("The option '--guide-data_split_structure' cannot be found, when initializing the data split. "); 210 } 211 try { 212 final String spec = "InputColumn(" + getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().trim()+ 213 ", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().trim() +")"; 214 divideFeature = featureVector.getFeatureModel().identifyFeature(spec); 215 } catch (FeatureException e) { 216 throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") cannot be initialized. ", e); 217 } 218 if (!(divideFeature instanceof Modifiable)) { 219 throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") does not implement Modifiable interface. "); 220 } 221 divideFeatureIndexVector = new ArrayList<Integer>(); 222 for (int i = 0; i < featureVector.size(); i++) { 223 if (featureVector.get(i).equals(divideFeature)) { 224 divideFeatureIndexVector.add(i); 225 } 226 } 227 228 // if ((Boolean)getGuide().getConfiguration().getOptionValue("malt0.4", "behavior") == true) { 229 // /* MaltParser 0.4 removes the divide feature for all divide models. For the "Sum-up" model or 230 // * master model adds the divide feature in the end of the feature vector. 231 // */ 232 // masterFeatureVector = (FeatureVector)featureVector.clone(); 233 // for (Integer i : divideFeatureIndexVector) { 234 // masterFeatureVector.remove(masterFeatureVector.get(i)); 235 // } 236 // for (Integer i : divideFeatureIndexVector) { 237 // masterFeatureVector.add(featureVector.get(i)); 238 // } 239 // 240 // divideFeatureVector = (FeatureVector)featureVector.clone(); 241 // for (Integer i : divideFeatureIndexVector) { 242 // divideFeatureVector.remove(divideFeatureVector.get(i)); 243 // } 244 // } else { 245 masterFeatureVector = featureVector; 246 divideFeatureVector = (FeatureVector)featureVector.clone(); 247 for (Integer i : divideFeatureIndexVector) { 248 divideFeatureVector.remove(divideFeatureVector.get(i)); 249 } 250 // } 251 try { 252 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString() != null) { 253 divideThreshold = Integer.parseInt(getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString()); 254 } else { 255 divideThreshold = 0; 256 } 257 } catch (NumberFormatException e) { 258 throw new GuideException("The --guide-data_split_threshold option is not an integer value. ", e); 259 } 260 } 261 262 /** 263 * Saves the feature divide model settings .fsm file. 264 * 265 * @throws MaltChainedException 266 */ 267 protected void save() throws MaltChainedException { 268 try { 269 final BufferedWriter out = new BufferedWriter(getGuide().getConfiguration().getConfigurationDir().getOutputStreamWriter(getModelName()+".dsm")); 270 out.write(masterModel.getIndex() + "\t" + masterModel.getFrequency() + "\n"); 271 272 if (divideModels != null) { 273 for (AtomicModel divideModel : divideModels.values()) { 274 out.write(divideModel.getIndex() + "\t" + divideModel.getFrequency() + "\n"); 275 } 276 } 277 out.close(); 278 } catch (IOException e) { 279 throw new GuideException("Could not write to the guide model settings file '"+getModelName()+".dsm"+"', when " + 280 "saving the guide model settings to file. ", e); 281 } 282 } 283 284 /** 285 * Loads the feature divide model settings .fsm file. 286 * 287 * @throws MaltChainedException 288 */ 289 protected void load() throws MaltChainedException { 290 try { 291 final BufferedReader in = new BufferedReader(getGuide().getConfiguration().getConfigurationDir().getInputStreamReaderFromConfigFile(getModelName()+".dsm")); 292 final Pattern tabPattern = Pattern.compile("\t"); 293 while(true) { 294 String line = in.readLine(); 295 if(line == null) break; 296 String[] cols = tabPattern.split(line); 297 if (cols.length != 2) { 298 throw new GuideException(""); 299 } 300 int code = -1; 301 int freq = 0; 302 try { 303 code = Integer.parseInt(cols[0]); 304 freq = Integer.parseInt(cols[1]); 305 } catch (NumberFormatException e) { 306 throw new GuideException("Could not convert a string value into an integer value when loading the feature divide model settings (.fsm). ", e); 307 } 308 if (code == -1) { 309 masterModel = new AtomicModel(-1, masterFeatureVector, this); 310 masterModel.setFrequency(freq); 311 } else if (divideModels != null) { 312 divideModels.put(code, new AtomicModel(code, divideFeatureVector, this)); 313 divideModels.get(code).setFrequency(freq); 314 } 315 setFrequency(getFrequency()+freq); 316 } 317 in.close(); 318 } catch (IOException e) { 319 throw new GuideException("Could not read from the guide model settings file '"+getModelName()+".dsm"+"', when " + 320 "loading the guide model settings. ", e); 321 } 322 } 323 324 /** 325 * Returns the parent model 326 * 327 * @return the parent model 328 */ 329 public Model getParent() { 330 return parent; 331 } 332 333 public ClassifierGuide getGuide() { 334 return parent.getGuide(); 335 } 336 337 /** 338 * Sets the parent model 339 * 340 * @param parent the parent model 341 */ 342 protected void setParent(Model parent) throws MaltChainedException { 343 this.parent = parent; 344 } 345 346 347 public String getModelName() throws MaltChainedException { 348 try { 349 return parent.getModelName(); 350 } catch (NullPointerException e) { 351 throw new GuideException("The parent guide model cannot be found. ", e); 352 } 353 } 354 355 /** 356 * Returns the "sum-up" or master feature vector 357 * 358 * @return a feature vector object 359 */ 360 public FeatureVector getMasterFeatureVector() { 361 return masterFeatureVector; 362 } 363 364 /** 365 * Returns the divide feature vector 366 * 367 * @return a feature vector object 368 */ 369 public FeatureVector getDivideFeatureVector() { 370 return divideFeatureVector; 371 } 372 373 /** 374 * Returns the frequency (number of instances) 375 * 376 * @return the frequency (number of instances) 377 */ 378 public int getFrequency() { 379 return frequency; 380 } 381 382 /** 383 * Increase the frequency by 1 384 */ 385 public void increaseFrequency() { 386 if (parent instanceof InstanceModel) { 387 ((InstanceModel)parent).increaseFrequency(); 388 } 389 frequency++; 390 } 391 392 public void decreaseFrequency() { 393 if (parent instanceof InstanceModel) { 394 ((InstanceModel)parent).decreaseFrequency(); 395 } 396 frequency--; 397 } 398 399 /** 400 * Sets the frequency (number of instances) 401 * 402 * @param frequency (number of instances) 403 */ 404 protected void setFrequency(int frequency) { 405 this.frequency = frequency; 406 } 407 408 409 /* (non-Javadoc) 410 * @see java.lang.Object#toString() 411 */ 412 public String toString() { 413 final StringBuilder sb = new StringBuilder(); 414 //TODO 415 return sb.toString(); 416 } 417 }