cwb-encode.c File Reference

#include <ctype.h>
#include <math.h>
#include <stdarg.h>
#include <sys/types.h>
#include <time.h>
#include <dirent.h>
#include <errno.h>
#include <sys/stat.h>
#include "../cl/globals.h"
#include "../cl/macros.h"
#include "../cl/storage.h"
#include "../cl/lexhash.h"
#include "../cl/endian.h"
#include "../cl/attributes.h"
#include <sys/time.h>

Data Structures

Defines

Typedefs

Functions

Variables


Define Documentation

#define FIELDSEPS   "\t\n"

Default string containing the characters that can function as field separators.

#define MAX_INPUT_LINE_LENGTH   65536

Input buffer size.

If we have XML tags with attributes, input lines can become pretty long (but there's basically just a single buffer)

#define MAXRANGES   1024

Maximum number of s-attributes; also the maximum number of p-attributes (this limit could be removed by reimplementing the attribute arrays as linked lists).

Referenced by declare_range(), and parse_options().

#define POS_CORPUS   "%s/%s.corpus"

CL naming convention for P-attribute Corpus files.

Referenced by declare_wattr().

#define POS_LEX   "%s/%s.lexicon"

CL naming convention for P-attribute Lexicon files.

Referenced by declare_wattr().

#define POS_LEXIDX   "%s/%s.lexicon.idx"

CL naming convention for P-attribute Lexicon-index files.

Referenced by declare_wattr().

#define REP_CHECK_LEXHASH_SIZE   1000

Number of buckets in the lexhashes used to detect repeated errors (undeclared element and attribute names in XML tags), so that each is reported only once.

Referenced by declare_range(), and main().

#define STRUC_AVS   "%s/%s.avs"

CL naming convention for S-attribute AVS files.

Referenced by declare_range().

#define STRUC_AVX   "%s/%s.avx"

CL naming convention for S-attribute AVX files.

Referenced by declare_range().

#define STRUC_RNG   "%s/%s.rng"

CL naming convention for S-attribute RNG files.

Referenced by declare_range().

#define UMASK   0644

User privileges of new files (octal format).

#define UNDEF_VALUE   "__UNDEF__"

Default string used as value of P-attributes when a value is missing, i.e. if a tab-delimited field is empty.


Typedef Documentation

typedef struct _Range Range

Range object: represents an S-attribute being encoded.


Function Documentation

void addline ( char *  str  ) 
void close_range ( Range rng,
int  end_pos 
)

Closes a currently open instance of an S-attribute.

Parameters:
rng Pointer to the S-attribute to close.
end_pos The corpus position at which this instance closes.

References Range::avs, Range::avx, cl_free, cl_lexhash_add(), cl_lexhash_find(), cl_strdup(), cl_string_list_get(), cl_string_list_size(), _cl_lexhash_entry::data, error(), Range::fd, _cl_lexhash_entry_data::integer, MAX_LINE_LENGTH, Range::name, Range::num, NwriteInt(), Range::offset, _cl_lexhash_entry_data::pointer, print_input_line(), silent, and Range::store_values.

Referenced by main(), and open_range().

Range* declare_range ( char *  name,
char *  directory,
int  store_values,
int  null_attribute 
)

Creates a Range object to store a specified s-attribute (and, if appropriate, does the same for children-attributes).

Parameters:
name The string from the user specifying the name of this attribute, recursion and any "attributes" of this XML element - e.g. "text:0+id"
directory The directory where the CWB data files will go.
store_values boolean: indicates whether this s-attribute was specified with -V (true) or -S (false) when the program was invoked.
null_attribute boolean: this is a null attribute, i.e. an XML element to be ignored.
Returns:
Pointer to the new Range object (which is a member of the global ranges array).

References Range::avs, Range::avx, buf, cl_calloc(), cl_free, cl_lexhash_add(), cl_lexhash_id(), cl_new_lexhash(), cl_new_string_list(), cl_strdup(), cl_string_list_append(), _cl_lexhash_entry::data, debug, Range::dir, error(), Range::fd, MAX_LINE_LENGTH, MAXRANGES, Range::name, Range::num, Range::offset, _cl_lexhash_entry_data::pointer, range_ptr, REP_CHECK_LEXHASH_SIZE, Range::store_values, STRUC_AVS, STRUC_AVX, and STRUC_RNG.

Referenced by parse_options().

int declare_wattr ( char *  name,
char *  directory,
int  nr_buckets 
)

Sets up a new p-attribute, including opening corpus, lex and index file handles.

Parameters:
name Identifier string of the p-attribute
directory Directory in which CWB data files are to be created.
nr_buckets Number of buckets in the lexhash of the new p-attribute (value passed to cl_new_lexhash() )
Returns:
Always 1.

References cl_new_lexhash(), cl_strdup(), DEFAULT_ATT_NAME, error(), WAttr::feature_set, WAttr::lh, MAX_LINE_LENGTH, WAttr::name, POS_CORPUS, POS_LEX, POS_LEXIDX, WAttr::position, and wattr_ptr.

Referenced by parse_options().

char* decode_entities ( char *  s  ) 

Decode XML entities in a string.

This function decodes pre-defined XML entities in string s. It overwrites the input string s and also returns s for convenience.

(The entities are &lt; &gt; &amp; &quot; &apos;).

If passed NULL, it will not fall over - it will just pass NULL back!

Parameters:
s A string to decode.
Returns:
The string (rewritten in situ).

Referenced by addline(), and open_range().

void error ( char *  format,
  ... 
)

Prints an error message to STDERR, automatically adding a message on the location of the error in the corpus.

Parameters:
format Format-specifying string of the error message.
... Additional arguments, printf-style.

References current_input_file, input_line, and print_input_line().

Referenced by addline(), call_dynamic_attribute(), close_range(), declare_range(), declare_wattr(), get_input_line(), main(), parse_options(), and scan_directory().

int find_range ( char *  name  ) 

Gets the index (in the ranges array) of a specified S-attribute.

See also:
ranges
Parameters:
name The S-attribute to search for.
Returns:
Index (as integer).

References range_ptr.

Referenced by main(), and parse_options().

int find_wattr ( char *  name  ) 

Gets the index (in wattrs) of the P-attribute with the given name.

See also:
wattrs
Parameters:
name The P-attribute to search for.
Returns:
Index (as integer).

References wattr_ptr.

Referenced by parse_options().

int get_input_line ( char *  buffer,
int  bufsize 
)

Reads one input line into the specified buffer (either from stdin, or from one or more input files).

The input files are not passed to the function, but are taken from the program's global variables.

This function returns False when the last input file has been completely read, and automatically closes files.

Parameters:
buffer Where to load the line to.
bufsize Not currently used, but should be MAX_INPUT_LINE_LENGTH in case of future use!
Returns:
boolean: true for all OK, false for a problem.

References cl_string_list_get(), current_input_file, current_input_file_name, error(), input_fd, input_file_is_pipe, input_line, MAX_INPUT_LINE_LENGTH, MAX_LINE_LENGTH, and nr_input_files.

Referenced by main().

int main ( int  argc,
char **  argv 
)

Main function for cwb-encode.

As well as the entry point to the program, this contains the main loop for each line of the corpus to be encoded.

The string of each line is sent to one of a number of different functions, depending on what is found in that string!

Parameters:
argc Number of command-line arguments.
argv Command-line arguments.

References addline(), Range::avs, Range::avx, buf, cl_free, cl_lexhash_add(), cl_lexhash_freq(), cl_malloc(), cl_new_lexhash(), cl_new_string_list(), cl_set_debug_level(), cl_strdup(), cl_string_list_get(), cl_string_list_size(), close_range(), corpus_character_set, debug, directory, error(), Range::fd, find_range(), get_input_line(), input_line, line, MAX_INPUT_LINE_LENGTH, Range::name, nr_input_files, open_range(), parse_options(), print_input_line(), print_range_registry_line(), printtime(), progname, quote_file_path(), range_ptr, registry_file, REP_CHECK_LEXHASH_SIZE, silent, skip_empty_lines, Range::store_values, strip_blanks, verbose, wattr_ptr, and xml_aware.

char* my_strtok ( register char *  s,
register const char *  delim 
)

A replacement for the strtok() function which doesn't skip empty fields.

Parameters:
s The string to split.
delim Delimiters to use in splitting.
Returns:
The next token from the string.

References last.

Referenced by addline().

void open_range ( Range rng,
int  start_pos,
char *  annot 
)

Opens an instance of the given S-attribute.

If rng has element attribute children, open_range() will mess around with the string annotation (otherwise not).

Parameters:
rng The S-attribute to open.
start_pos The corpus position at which this instance begins.
annot The annotation string (the XML element's att-val pairs).

References cl_free, cl_lexhash_add(), cl_lexhash_find(), cl_lexhash_freq(), cl_make_set(), cl_strdup(), cl_string_list_get(), cl_string_list_size(), close_range(), _cl_lexhash_entry::data, decode_entities(), _cl_lexhash_entry_data::integer, line, Range::name, _cl_lexhash_entry_data::pointer, print_input_line(), silent, Range::store_values, and strip_blanks.

Referenced by main(), and write_region_to_disk().

void parse_options ( int  argc,
char **  argv 
)
void print_input_line ( void   ) 

Prints the input line (and input file, if applicable) on STDERR, for error messages and warnings.

References current_input_file_name, input_line, and nr_input_files.

Referenced by addline(), close_range(), error(), main(), and open_range().

void print_range_registry_line ( Range rng,
FILE *  fd,
int  print_comment 
)

Prints registry lines for a given s-attribute, and its children, if any, to the specified file handle.

Parameters:
rng The s-attribute in question.
fd File handle for the registry file.
print_comment Boolean: if true, a comment on the original XML tags is printed.

References cl_lexhash_find(), cl_string_list_get(), cl_string_list_size(), _cl_lexhash_entry::data, Range::name, _cl_lexhash_entry_data::pointer, and Range::store_values.

Referenced by main().

void printtime ( FILE *  stream,
char *  msg 
)

Prints a message plus the current time to the specified file/stream.

Parameters:
stream Stream to print to.
msg Message to incorporate into the string that is printed.

Referenced by main().

char* quote_file_path ( char *  path  ) 

Add quotes and escape slashes to a path name if necessary.

This is for the HOME and INFO fields of the registry file.

For consistency, this function always returns a newly allocated string, regardless of whether changes have been made.

Parameters:
path String containing the path to quotify.
Returns:
The quotified string.

References cl_malloc(), and cl_strdup().

Referenced by main().

cl_string_list scan_directory ( char *  dir  ) 

Get a list of files in a given directory.

This function only lists files with .vrt or .vrt.gz extensions, and only files identified by POSIX stat() as "regular".

Parameters:
dir Path of directory to look in.
Returns:
List of paths to files (*including* the directory name).

References cl_free, cl_malloc(), cl_new_string_list(), cl_string_list_append(), cl_string_list_qsort(), and error().

Referenced by parse_options().

void usage ( void   ) 

Prints a usage message and exits the program.

References progname, and undef_value.


Variable Documentation

char* corpus_character_set = "latin1"

character set label that is inserted into the registry file

Referenced by main(), and parse_options().

index of input file currently being processed

Referenced by error(), and get_input_line().

filename of current input file, for error messages

Referenced by get_input_line(), and print_input_line().

int debug = 0

debug mode on or off?

char* directory = NULL

corpus data directory (no longer defaults to current directory)

Referenced by main(), and parse_options().

char* field_separators = FIELDSEPS

string containing the characters that can function as field separators

Referenced by addline().

FILE* input_fd = NULL

file handle for current input file (or pipe)

so we can properly close input_fd using either fclose() or pclose()

Referenced by get_input_line().

list of input file (-f option(s))

int input_line = 0

input line number (reset for each new file) for error messages

Referenced by error(), get_input_line(), load_macro_file(), main(), and print_input_line().

int line = 0
int nr_input_files = 0

number of input files (length of list after option processing)

Referenced by get_input_line(), main(), and print_input_line().

char* progname = NULL

name of the currently running program

int range_ptr = 0
See also:
ranges

Referenced by declare_range(), find_range(), main(), and parse_options().

Range ranges[MAXRANGES]

An array for keeping track of S-attributes being encoded.

char* registry_file = NULL

if set, auto-generate registry file named {registry_file}, listing declared attributes

Referenced by main(), and parse_options().

int silent = 0

hide messages

skip empty lines when encoding?

int strip_blanks = 0

strip leading and trailing blanks from input and token annotations

lookup hash for undeclared s-attributes and s-attributes declared with -S that have annotations (which will be ignored), so warnings are issued only once

char* undef_value = UNDEF_VALUE

string used as value of P-attributes when a value is missing, i.e. if a tab-delimited field is empty

Referenced by addline(), parse_options(), and usage().

int verbose = 0

show progress (this is _not_ the opposite of silent!)

int wattr_ptr = 0
WAttr wattrs[MAXRANGES]

An array for keeping track of P-attributes being encoded.

int xml_aware = 0

substitute XML entities in p-attributes & ignore <? and <! lines


Generated on Sun Feb 28 18:08:04 2010 for CWB by  doxygen 1.6.1