CWB
|
#include <stdarg.h>
#include <math.h>
#include <ctype.h>
#include <sys/types.h>
#include "globals.h"
#include "endian.h"
#include "macros.h"
#include "attributes.h"
#include "special-chars.h"
#include "bitio.h"
#include "compression.h"
#include "regopt.h"
#include "cdaccess.h"
#define check_arg | ( | arg, | |
atyp, | |||
rval | |||
) |
if (arg == NULL) { \ cl_errno = CDA_ENULLATT; return rval; \ } \ else if (arg->type != atyp) { \ cl_errno = CDA_EATTTYPE; return rval; \ }
Checks an Attribute passed as a function argument for usability in that function.
(a) arg must not be NULL. (b) arg type has to be the type specified in atyp.
If these conditions are not fulfilled, the current function returns rval, and cl_errno is set.
Referenced by cl_cpos2alg2cpos_oldstyle(), cl_cpos2id(), cl_cpos2str(), cl_cpos2struc2cpos(), cl_cpos2struc_oldstyle(), cl_dynamic_call(), cl_dynamic_numargs(), cl_has_extended_alignment(), cl_id2all(), cl_id2cpos_oldstyle(), cl_id2freq(), cl_id2sort(), cl_id2str(), cl_id2strlen(), cl_idlist2cpos_oldstyle(), cl_idlist2freq(), cl_index_compressed(), cl_max_cpos(), cl_max_id(), cl_new_stream(), cl_regex2id(), cl_sequence_compressed(), cl_sort2id(), cl_str2id(), cl_struc2cpos(), cl_struc2str(), cl_struc_values(), and get_nr_of_strucs().
#define COMPRESS_DEBUG 0 |
If COMPRESS_DEBUG is set to a positive integer, cl_cpos2id() (formerly get_id_at_position()) will print debugging messages.
(2 prints more than 1!)
Referenced by cl_cpos2id().
typedef struct _position_stream_rec_ PositionStreamRecord |
Underlying structure for the PositionStream object.
PositionStreams are used for accessing Attributes. Each one represents a stream of corpus positions, representing positions where a given item occurs.
int cl_alg2cpos | ( | Attribute * | attribute, |
int | alg, | ||
int * | source_region_start, | ||
int * | source_region_end, | ||
int * | target_region_start, | ||
int * | target_region_end | ||
) |
Gets the corpus positions of an alignment on the given align-attribute.
Note that four corpus positions are retrieved, into the addresses given as parameters.
attribute | The align-attribute to look on. |
alg | The ID of the alignment whose positions are wanted. |
source_region_start | Location to put source corpus start position. |
source_region_end | Location to put source corpus end position. |
target_region_start | Location to put target corpus start position. |
target_region_end | Location to put target corpus end position. |
References CDA_EIDXORNG, CDA_ENODATA, CDA_OK, cl_errno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.
Referenced by check_alignment_constraints(), compose_kwic_line(), decode_print_token_sequence(), do_cqi_cl_alg2cpos(), and printAlignedStrings().
int cl_cpos2alg | ( | Attribute * | attribute, |
int | cpos | ||
) |
Gets the id number of the alignment at the specified corpus position.
attribute | The align-attribute to look on. |
cpos | The corpus position to look at. |
References CDA_EALIGN, CDA_ENODATA, CDA_EPOSORNG, CDA_OK, cl_errno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, TMblob::data, TComponent::data, ensure_component(), get_alignment(), get_extended_alignment(), and TComponent::size.
Referenced by check_alignment_constraints(), compose_kwic_line(), decode_print_token_sequence(), do_cqi_cl_cpos2alg(), and printAlignedStrings().
int cl_cpos2alg2cpos_oldstyle | ( | Attribute * | attribute, |
int | position, | ||
int * | source_corpus_start, | ||
int * | source_corpus_end, | ||
int * | aligned_corpus_start, | ||
int * | aligned_corpus_end | ||
) |
Gets the corpus positions of an alignment on the given align-attribute.
This is for old-style alignments only: it doesn't (can't) deal with extended alignments. Deprecated: use cl_alg2cpos instead (but note its parameters are not identical).
attribute | The align-attribute to look on. |
position | The corpus position {??} of the alignment whose positions are wanted. |
source_corpus_start | Location to put source corpus start position. |
source_corpus_end | Location to put source corpus end position. |
aligned_corpus_start | Location to put target corpus start position. |
aligned_corpus_end | Location to put target corpus end position. |
References ATT_ALIGN, CDA_ENODATA, CDA_EPOSORNG, CDA_OK, check_arg, cl_errno, CompAlignData, TMblob::data, TComponent::data, ensure_component(), get_alignment(), and TComponent::size.
int cl_cpos2boundary | ( | Attribute * | a, |
int | cpos | ||
) |
Compares the location of a corpus position to the regions of an s-attribute.
This determines whether the specified corpus position is within a region (i.e. a structure, an instance of that s-attribute) on the given s-attribute; and/or on a boundary; or outside a region.
a | The s-attribute on which to search. |
cpos | The corpus position to look for. |
References CDA_ESTRUC, cl_cpos2struc2cpos(), cl_errno, STRUC_INSIDE, STRUC_LBOUND, and STRUC_RBOUND.
int cl_cpos2id | ( | Attribute * | attribute, |
int | position | ||
) |
Gets the integer ID of the item at the specified position on the given p-attribute.
attribute | The P-attribute to look on. |
position | The corpus position to look at. |
References _Attribute::any, ATT_POS, BSclose(), BSopen(), BSread(), BSseek(), CDA_ENODATA, CDA_EPOSORNG, CDA_OK, check_arg, cl_errno, CompCorpus, CompHuffCodes, CompHuffSeq, CompHuffSync, COMPRESS_DEBUG, corpus, TMblob::data, TComponent::data, ensure_component(), POS_Attribute::hc, item_sequence_is_compressed, _huffman_code_descriptor::length, _huffman_code_descriptor::min_code, _Attribute::pos, _huffman_code_descriptor::symbols, _huffman_code_descriptor::symindex, SYNCHRONIZATION, POS_Attribute::this_block, and POS_Attribute::this_block_nr.
Referenced by cl_cpos2str(), compute_code_lengths(), creat_rev_corpus(), decode_check_huff(), do_cqi_cl_cpos2id(), get_group_id(), i2compare(), main(), SortSubcorpus(), and validate_revcorp().
char* cl_cpos2str | ( | Attribute * | attribute, |
int | position | ||
) |
Gets the string of the item at the specified position on the given p-attribute.
attribute | The P-attribute to look on. |
position | The corpus position to look at. |
References ATT_POS, CDA_OK, check_arg, cl_cpos2id(), cl_errno, and cl_id2str().
Referenced by decode_print_token_sequence(), do_cqi_cl_cpos2str(), print_tabulation(), SortExternally(), and SortSubcorpus().
int cl_cpos2struc | ( | Attribute * | a, |
int | cpos | ||
) |
Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position.
This is a wrapper of the "old" function get_num_of_struc() that normalises it to standard return value behaviour.
a | The s-attribute on which to search. |
cpos | The corpus position to look for. |
References cl_cpos2struc_oldstyle(), and cl_errno.
Referenced by compose_kwic_line(), decode_print_surrounding_s_att_values(), decode_print_token_sequence(), do_cqi_cl_cpos2lbound(), do_cqi_cl_cpos2rbound(), do_cqi_cl_cpos2struc(), eval_constraint(), get_position_values(), and main().
int cl_cpos2struc2cpos | ( | Attribute * | attribute, |
int | position, | ||
int * | struc_start, | ||
int * | struc_end | ||
) |
Gets the start and end positions of the instance of the given S-attribute found at the specified corpus position.
This function finds one particular instance of the S-attribute, and assigns its start and end points to the locations given as arguments.
attribute | The s-attribute to search. |
position | The corpus position to search for. |
struc_start | Location for the start position of the instance. |
struc_end | Location for the end position of the instance. |
References ATT_STRUC, CDA_ENODATA, CDA_ESTRUC, CDA_OK, check_arg, cl_errno, CompStrucData, TMblob::data, TComponent::data, ensure_component(), get_previous_mark(), and TComponent::size.
Referenced by cl_cpos2boundary(), and decode_print_token_sequence().
int cl_cpos2struc_oldstyle | ( | Attribute * | attribute, |
int | position, | ||
int * | struc_num | ||
) |
Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position.
Deprecated function: use cl_cpos2struc.
attribute | The s-attribute on which to search. |
position | The corpus position to look for. |
struc_num | Location where the number of the structure that is found will be put. |
References ATT_STRUC, CDA_ENODATA, CDA_ESTRUC, CDA_OK, check_arg, cl_errno, CompStrucData, TMblob::data, TComponent::data, ensure_component(), get_previous_mark(), and TComponent::size.
Referenced by cl_cpos2struc().
int cl_delete_stream | ( | PositionStream * | ps | ) |
Deletes a PositionStream object.
References BSclose().
Referenced by compress_reversed_index(), and decompress_check_reversed_index().
int cl_dynamic_call | ( | Attribute * | attribute, |
DynCallResult * | dcr, | ||
DynCallResult * | args, | ||
int | nr_args | ||
) |
Calls a dynamic attribute.
This is the attribute access function for dynamic attributes.
attribute | The (dynamic) attribute in question. |
dcr | Location for the result (*int or *char). |
args | Location of the parameters (of *int or *char). |
nr_args | Number of parameters. |
References Dynamic_Attribute::arglist, ATT_DYN, ATTAT_FLOAT, ATTAT_INT, ATTAT_NONE, ATTAT_PAREF, ATTAT_POS, ATTAT_STRING, ATTAT_VAR, Dynamic_Attribute::call, CDA_EARGS, CDA_OK, _DCR::charres, check_arg, cl_errno, CL_MAX_LINE_LENGTH, cl_strdup(), _Attribute::dyn, _DCR::floatres, _DCR::intres, _DynArg::next, Dynamic_Attribute::res_type, _DCR::type, _DynArg::type, and _DCR::value.
int cl_dynamic_numargs | ( | Attribute * | attribute | ) |
Count the number of arguments on a dynamic attribute's argument list.
attribute | pointer to the Attribute object to analyse; it must be a dynamic attribute. |
References Dynamic_Attribute::arglist, ATT_DYN, ATTAT_VAR, CDA_OK, check_arg, cl_errno, _Attribute::dyn, _DynArg::next, and _DynArg::type.
void cl_error | ( | char * | message | ) |
Prints an error message, together with a string identifying the current error number.
References cl_errno, and cl_error_string().
Referenced by compress_reversed_index(), decode_print_token_sequence(), decompress_check_reversed_index(), lexdecode_print_item_info(), lexdecode_show(), and main().
char* cl_error_string | ( | int | error_num | ) |
Gets a string describing the error identified by an error number.
error_num | Error number integer (a CDA_* constant as defined in cl.h) |
References CDA_EALIGN, CDA_EARGS, CDA_EATTTYPE, CDA_EBADREGEX, CDA_EBUFFER, CDA_EFSETINV, CDA_EIDORNG, CDA_EIDXORNG, CDA_EINTERNAL, CDA_ENODATA, CDA_ENOMEM, CDA_ENOSTRING, CDA_ENULLATT, CDA_ENYI, CDA_EOTHER, CDA_EPATTERN, CDA_EPOSORNG, CDA_EREMOTE, CDA_ESTRUC, and CDA_OK.
Referenced by cl_error().
int cl_has_extended_alignment | ( | Attribute * | attribute | ) |
Checks whether an attribute's XALIGN component exists, that is, whether or not it has extended alignment.
attribute | An align-attribute. |
References ATT_ALIGN, check_arg, cl_errno, component_state(), ComponentLoaded, ComponentUnloaded, and CompXAlignData.
Referenced by cl_alg2cpos(), cl_cpos2alg(), cl_max_alg(), and describecorpus_show_statistics().
char* cl_id2all | ( | Attribute * | attribute, |
int | index, | ||
int * | freq, | ||
int * | slen | ||
) |
Gets the string of the item with the specified ID on the given p-attribute.
As well as returning the string, other information about the item is inserted into locations specified by other parameters.
attribute | The P-attribute to look on. |
index | The ID of the item to look at. |
freq | Will be set to the frequency of the item. |
slen | Will be set to the string-length of the item. |
References ATT_POS, CDA_OK, check_arg, cl_errno, get_id_frequency, get_id_string_len, and get_string_of_id.
Referenced by lexdecode_print_item_info().
int* cl_id2cpos_oldstyle | ( | Attribute * | attribute, |
int | id, | ||
int * | freq, | ||
int * | restrictor_list, | ||
int | restrictor_list_size | ||
) |
Gets all the corpus positions where the specified item is found on the given P-attribute.
The restrictor list is a set of ranges in which instances of the item MUST occur to be collected by this function. If no restrictor list is specified (i.e. restrictor_list is NULL), then ALL corpus positions where the item occurs are returned.
This restrictor list has the form of a list of ranges {start,end} of size restrictor_list_size, that is, the number of ints in this area is 2 * restrictor_list_size!!!
This function is "oldstyle" because in the "newstyle" function, there is no restrictor list. (And in fact, the newstyle function is implemented as a macro to this one with the last two arguments NULL and 0.)
attribute | The P-attribute to look on. |
id | The id of the item to look for. |
freq | The frequency of the specified item is written here. This will be 0 in the case of errors. |
restrictor_list | A list of pairs of integers specifying ranges {start,end} in the corpus |
restrictor_list_size | The number of PAIRS of ints in the restrictor list. |
References ATT_POS, BSclose(), BSopen(), BSseek(), CDA_EIDORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_free, cl_malloc(), cl_realloc(), CompCompRF, CompCompRFX, CompRevCorpus, CompRevCorpusIdx, compute_ba(), TMblob::data, TComponent::data, ensure_component(), get_attribute_size, get_id_frequency, get_id_range, inverted_file_is_compressed, and read_golomb_code_bs().
int cl_id2freq | ( | Attribute * | attribute, |
int | id | ||
) |
Gets the frequency of an item on this attribute.
attribute | The P-attribute to look on |
id | Identifier of an item on this attribute. |
References ATT_POS, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompCorpusFreqs, TMblob::data, TComponent::data, and ensure_component().
Referenced by cl_idlist2freq(), compress_reversed_index(), creat_rev_corpus(), create_feature_maps(), decompress_check_reversed_index(), do_cqi_cl_id2freq(), and validate_revcorp().
int cl_id2sort | ( | Attribute * | attribute, |
int | id | ||
) |
Gets the position in the Attribute's sorted wordlist index of the item with the specified ID code.
This function is NOT YET IMPLEMENTED.
attribute | The (positional) Attribute whose index is to be searched |
id | Identifier of an item on this attribute. |
References ATT_POS, CDA_ENODATA, CDA_ENYI, CDA_OK, check_arg, cl_errno, CompLexiconSrt, and ensure_component().
char* cl_id2str | ( | Attribute * | attribute, |
int | id | ||
) |
Gets the string that corresponds to the specified item on the given P-attribute.
attribute | The Attribute to look the item up on |
id | Identifier of an item on this attribute. |
References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompLexicon, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.
Referenced by cl_cpos2str(), create_feature_maps(), do_cqi_cl_id2str(), Group_id2str(), i2compare(), main(), and scancorpus_add_key().
int cl_id2strlen | ( | Attribute * | attribute, |
int | id | ||
) |
Calculates the length of the string that corresponds to the specified item on the given P-attribute.
attribute | The (positional) Attribute to look up the item on |
id | Identifier of an item on this attribute. |
References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_EOTHER, CDA_OK, check_arg, cl_errno, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), get_string_of_id, and TComponent::size.
Referenced by create_feature_maps().
int* cl_idlist2cpos_oldstyle | ( | Attribute * | attribute, |
int * | word_ids, | ||
int | number_of_words, | ||
int | sort, | ||
int * | size_of_table, | ||
int * | restrictor_list, | ||
int | restrictor_list_size | ||
) |
Gets a list of corpus positions matching a list of ids.
This function returns an (ordered) list of all corpus positions which match one of the ids given in the list of ids. The table is allocated with malloc, so free it when you don't need it any more.
The list itself is returned; its size is placed in size_of_table. This size is, of course, the same as the cumulative id frequency of the ids (because each corpus position matching one of the ids is added into the list).
BEWARE: when the id list is rather big or there are highly-frequent ids in the id list (for example, after a call to collect_matching_ids with the pattern ".*") this will give a copy of the corpus -- for which you probably don't have enough memory!!! It is therefore a good idea to call cumulative_id_frequency before and to introduce some kind of bias.
This function is DEPRECATED in favour of cl_idlist2cpos().
This function is "oldstyle" because it has the "restrictor list" parameters, which are not available through the "newstyle" function cl_idlist2cpos() (which is currently just a macro to this).
A note on the last two parameters, which are currently unused: restrictor_list is a list of integer pairs [a,b] which means that the returned value only contains positions which fall within at least one of these intervals. The list must be sorted by the start positions, and secondarily by b. restrictor_list_size is the number of integers in this list, NOT THE NUMBER OF PAIRS. WARNING: CURRENTLY UNIMPLEMENTED {NB -- this description of restrictor_list_size DOESN'T MATCH the one for get_positions(), which this function calls...}
REMEMBER: this monster returns a list of corpus indices, not a list of ids.
attribute | The P-attribute we are looking in |
word_ids | A list of item ids (i.e. id codes for items on this attribute). |
number_of_words | The length of this list. |
sort | boolean: return sorted list? |
size_of_table | The size of the allocated table will be placed here. |
restrictor_list | See function description. |
restrictor_list_size | See function description. |
References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_malloc(), CompLexiconIdx, cumulative_id_frequency, ensure_component(), get_positions, intcompare(), and TComponent::size.
Referenced by get_matched_corpus_positions().
int cl_idlist2freq | ( | Attribute * | attribute, |
int * | word_ids, | ||
int | number_of_words | ||
) |
Calculates the total frequency of all items on a list of item IDs.
This function returns the sum of the word frequencies of words, which is an array of word_ids with length number_of_words.
The result is therefore the number of corpus positions which match one of the words.
attribute | P-attribute on which these items are found. |
word_ids | An array of item IDs. |
number_of_words | Length of the word_ids array. |
References ATT_POS, CDA_ENODATA, CDA_OK, check_arg, cl_errno, and cl_id2freq().
Referenced by OptimizeStringConstraint().
int cl_index_compressed | ( | Attribute * | attribute | ) |
Check whether the index (inverted file) of the given P-attribute is compressed.
See comments in body of function for what counts as "compressed".
References ATT_POS, check_arg, cl_errno, CompCompRF, CompCompRFX, component_state(), ComponentLoaded, ComponentUnloaded, CompRevCorpus, and CompRevCorpusIdx.
Referenced by cl_new_stream().
int cl_max_alg | ( | Attribute * | attribute | ) |
Gets the id number of alignments on this align-attribute.
This is equal to the maximum alignment on this attribute.
attribute | An align-attribute. |
References CDA_ENODATA, CDA_OK, cl_errno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, ensure_component(), and TComponent::size.
Referenced by describecorpus_show_statistics(), and do_cqi_cl_attribute_size().
int cl_max_cpos | ( | Attribute * | attribute | ) |
Gets the maximum position on this P-attribute (ie the size of the attribute).
The result of this function is equal to the number of tokens in the attribute.
If the attribute's item sequence is compressed, this is read from the attribute's Huffman code descriptor block.
Otherwise, it is read from the size member of the Attribute's CompCorpus component.
References ATT_POS, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompCorpus, CompHuffCodes, corpus, ensure_component(), POS_Attribute::hc, item_sequence_is_compressed, _huffman_code_descriptor::length, _Attribute::pos, and TComponent::size.
Referenced by compress_reversed_index(), compute_code_lengths(), creat_rev_corpus(), decode_check_huff(), decompress_check_reversed_index(), describecorpus_show_basic_info(), describecorpus_show_statistics(), do_cqi_cl_attribute_size(), get_matched_corpus_positions(), lexdecode_show(), main(), OptimizeStringConstraint(), Setop(), SortSubcorpus(), and validate_revcorp().
int cl_max_id | ( | Attribute * | attribute | ) |
Gets the maximum id on this P-attribute (ie the range of the attribute's ID codes).
The result of this function is equal to the number of types in this attribute.
References ATT_POS, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompLexiconIdx, ensure_component(), and TComponent::size.
Referenced by compress_reversed_index(), compute_code_lengths(), creat_rev_corpus(), create_feature_maps(), decompress_check_reversed_index(), describecorpus_show_statistics(), do_cqi_cl_lexicon_size(), get_matched_corpus_positions(), lexdecode_show(), main(), and validate_revcorp().
int cl_max_struc | ( | Attribute * | a | ) |
Gets the maximum for this S-attribute (ie the size of the S-attribute).
The result of this function is equal to the number of instances of this s-attribute in the corpus.
This function works as a wrapper round cl_max_struc_oldstyle that normalises it to standard return value behaviour.
a | The s-attribute to evaluate. |
References cl_errno, and get_nr_of_strucs().
Referenced by compose_kwic_line(), describecorpus_show_statistics(), do_cqi_cl_attribute_size(), main(), matchfirstpattern(), and scancorpus_add_key().
PositionStream cl_new_stream | ( | Attribute * | attribute, |
int | id | ||
) |
Creates a new PositionStream object.
attribute | The P-attribute to open the position stream on |
id | The id that the new PositionStream will have. This is the id of an item on the specified attribute. |
References ATT_POS, _position_stream_rec_::attribute, _position_stream_rec_::b, _position_stream_rec_::base, _position_stream_rec_::bs, BSopen(), BSseek(), CDA_EIDORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_index_compressed(), CompCompRF, CompCompRFX, CompRevCorpus, CompRevCorpusIdx, compute_ba(), TMblob::data, TComponent::data, ensure_component(), get_attribute_size, get_id_frequency, get_id_range, _position_stream_rec_::id, _position_stream_rec_::id_freq, _position_stream_rec_::is_compressed, _position_stream_rec_::last_pos, and _position_stream_rec_::nr_items.
Referenced by compress_reversed_index(), and decompress_check_reversed_index().
int cl_read_stream | ( | PositionStream | ps, |
int * | buffer, | ||
int | buffer_size | ||
) |
Reads corpus positions from a position stream to a buffer.
ps | The position stream to read. |
buffer | Location to put the resulting item positions. |
buffer_size | Maximum number of item positions to read. (Fewer will be read if fewer are available). |
References _position_stream_rec_::b, _position_stream_rec_::base, _position_stream_rec_::bs, _position_stream_rec_::id_freq, _position_stream_rec_::is_compressed, _position_stream_rec_::last_pos, _position_stream_rec_::nr_items, and read_golomb_code_bs().
Referenced by compress_reversed_index(), and decompress_check_reversed_index().
int* cl_regex2id | ( | Attribute * | attribute, |
char * | pattern, | ||
int | flags, | ||
int * | number_of_matches | ||
) |
Gets a list of the ids of those items on a given Attribute that match a particular regular-expression pattern.
The pattern is interpreted internally with the CL regex engine, q.v.
The function returns a pointer to a sequence of ints of size number_of_matches. The list is allocated with malloc(), so do a cl_free() when you don't need it any more.
attribute | The p-attribute to look on. |
pattern | String containing the pattern against which to match each item on the attribute. Note: this pattern is a regular expression, but it is passed as a string, not a CL_Regex object. The CL_Regex object is created internally. |
flags | Flags for the regular expression system via cl_new_regex. |
number_of_matches | This is set to the number of item ids found, i.e. the size of the returned buffer. |
References ATT_POS, CDA_EBADREGEX, CDA_ENODATA, CDA_OK, check_arg, cl_calloc(), cl_debug, cl_delete_regex(), cl_errno, cl_free, cl_malloc(), cl_new_regex(), cl_regex_error, cl_regex_match(), cl_regex_optimised(), cl_regopt_count_get(), cl_regopt_count_reset(), CompLexicon, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), _Attribute::pos, TComponent::size, and word.
Referenced by do_cqi_cl_regex2id(), get_matched_corpus_positions(), lexdecode_show(), and scancorpus_add_key().
int cl_sequence_compressed | ( | Attribute * | attribute | ) |
Checks whether the item sequence of the given P-attribute is compressed.
See comments in body of function for what counts as "compressed".
References ATT_POS, check_arg, cl_errno, CompCorpus, CompHuffCodes, CompHuffSeq, CompHuffSync, component_state(), ComponentLoaded, ComponentUnloaded, POS_Attribute::hc, and _Attribute::pos.
int cl_sort2id | ( | Attribute * | attribute, |
int | sort_index_position | ||
) |
Gets the ID code of the item at the specified position in the Attribute's sorted wordlist index.
That is, given a sort-order position, the actual ID of the corresponding item is generated.
attribute | The (positional) Attribute whose index is to be searched. |
sort_index_position | The offset in the index where the ID code is to be found. |
References ATT_POS, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompLexiconSrt, TMblob::data, TComponent::data, and ensure_component().
Referenced by lexdecode_show().
int cl_str2id | ( | Attribute * | attribute, |
char * | id_string | ||
) |
Gets the ID code that corresponds to the specified string on the given P-attribute.
attribute | The (positional) Attribute to look the string up on |
id_string | The string of an item on this attribute |
References ATT_POS, CDA_ENODATA, CDA_ENOSTRING, CDA_EOTHER, CDA_OK, check_arg, cl_errno, cl_strcmp(), CompLexicon, CompLexiconIdx, CompLexiconSrt, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.
Referenced by create_feature_maps(), do_cqi_cl_str2id(), get_corpus_positions(), and lexdecode_show().
int cl_strcmp | ( | char * | s1, |
char * | s2 | ||
) |
CL internal string comparison (uses signed char on all platforms).
Referenced by cl_set_intersection(), cl_str2id(), cl_string_list_strcmp(), and scompare().
int cl_struc2cpos | ( | Attribute * | attribute, |
int | struc_num, | ||
int * | struc_start, | ||
int * | struc_end | ||
) |
Retrieves the start-and-end corpus positions of a specified structure of the given s-attribute type.
attribute | An s-attribute. |
struc_num | The instance of that s-attribute to retrieve (i.e. the struc_num'th instance of this s-attribute in the corpus). |
struc_start | Location to put the starting corpus position. |
struc_end | Location to put the ending corpus position. |
References ATT_STRUC, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompStrucData, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.
Referenced by align_print_line(), compose_kwic_line(), decode_print_token_sequence(), do_cqi_cl_cpos2lbound(), do_cqi_cl_cpos2rbound(), do_cqi_cl_struc2cpos(), eval_constraint(), get_position_values(), main(), and matchfirstpattern().
char* cl_struc2str | ( | Attribute * | attribute, |
int | struc_num | ||
) |
Gets the value that is associated with the specified instance of the given s-attribute.
attribute | An S-attribute. |
struc_num | ID of the structure whose value is wanted (ie, function gets value of struc_num'th instance of this s-attribute) |
References ATT_STRUC, CDA_EIDXORNG, CDA_EINTERNAL, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_struc_values(), CompStrucAVS, CompStrucAVX, TMblob::data, TComponent::data, ensure_component(), s_v_comp(), and TComponent::size.
Referenced by compute_grouping(), decode_print_surrounding_s_att_values(), decode_print_token_sequence(), do_cqi_cl_struc2str(), eval_constraint(), get_position_values(), main(), matchfirstpattern(), and scancorpus_add_key().
int cl_struc_values | ( | Attribute * | attribute | ) |
Checks whether this s-attribute has attribute values.
References ATT_STRUC, CDA_OK, check_arg, cl_errno, component_state(), ComponentLoaded, ComponentUnloaded, CompStrucAVS, CompStrucAVX, Struc_Attribute::has_attribute_values, and _Attribute::struc.
Referenced by cl_struc2str(), compute_grouping(), decode_print_token_sequence(), describecorpus_show_statistics(), do_cqi_corpus_structural_attribute_has_values(), do_XMLTag(), get_position_values(), main(), print_tabulation(), PrintAttributes(), PrintAttributesSimple(), and scancorpus_add_key().
int get_alignment | ( | int * | data, |
int | size, | ||
int | position | ||
) |
Gets the id number of the alignment at the specified corpus position.
For use with non-extended alignments. Requires members of the ALIGN component as arguments.
Not an exported function!
{Query:am I correct that "position" here means a cpos?? -- AH} {If I'm not, other docblocks in cdaccess also have errors}
data | The data member of a CompAlignData component. |
size | The size member of the same CompAlignData component. |
position | The corpus position to look at. |
Referenced by cl_cpos2alg(), and cl_cpos2alg2cpos_oldstyle().
int get_extended_alignment | ( | int * | data, |
int | size, | ||
int | position | ||
) |
Gets the id number of the alignment at the specified corpus position.
For use with extended alignments. Requires members of the XALIGN component as arguments.
Not an exported function!
data | The data member of a CompXAlignData component. |
size | The size member of the same CompXAlignData component. |
position | The corpus position to look at. |
References CDA_EALIGN.
Referenced by cl_cpos2alg().
int get_nr_of_strucs | ( | Attribute * | attribute, |
int * | nr_strucs | ||
) |
Gets the number of instances of an s-attribute in the corpus.
Deprecated: use cl_max_struc instead.
attribute | The s-attribute to count. |
nr_strucs | The number of instances is put here. |
References ATT_STRUC, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompStrucData, ensure_component(), and TComponent::size.
Referenced by calculate_ranges(), and cl_max_struc().
int* get_previous_mark | ( | int * | data, |
int | size, | ||
int | position | ||
) |
Gets a pointer to the location where a structure is stored.
The structure (instance of an s-attribute) that is found is the one in which the specified corpus position occurs.
Non-exported function.
data | "data.data" member of an s-attribute |
size | "size" member of the same s-attribute |
position | The corpus position to look for. |
Referenced by cl_cpos2struc2cpos(), and cl_cpos2struc_oldstyle().
static int intcompare | ( | const void * | i, |
const void * | j | ||
) | [static] |
internal function for use with qsort
Referenced by cl_idlist2cpos_oldstyle().
int s_v_comp | ( | const void * | v1, |
const void * | v2 | ||
) |
A non-exported function used by cl_struc2str.
Referenced by cl_struc2str().
char* structure_value_at_position | ( | Attribute * | struc, |
int | position | ||
) |
Gets the value associated with the instance of the given s-attribute that occurs at the specified corpus position.
struc | The s-attribute to search through. |
position | The corpus position being queried. |
References get_num_of_struc, and structure_value.
Referenced by get_leaf_value(), and get_print_attribute_values().
int cl_errno |
Error number for CL: is set after access to any of various corpus-data-access functions.
Referenced by cl_alg2cpos(), cl_cpos2alg(), cl_cpos2alg2cpos_oldstyle(), cl_cpos2boundary(), cl_cpos2id(), cl_cpos2str(), cl_cpos2struc(), cl_cpos2struc2cpos(), cl_cpos2struc_oldstyle(), cl_dynamic_call(), cl_dynamic_numargs(), cl_error(), cl_has_extended_alignment(), cl_id2all(), cl_id2cpos_oldstyle(), cl_id2freq(), cl_id2sort(), cl_id2str(), cl_id2strlen(), cl_idlist2cpos_oldstyle(), cl_idlist2freq(), cl_index_compressed(), cl_make_set(), cl_max_alg(), cl_max_cpos(), cl_max_id(), cl_max_struc(), cl_new_regex(), cl_new_stream(), cl_regex2id(), cl_sequence_compressed(), cl_set_intersection(), cl_set_size(), cl_sort2id(), cl_str2id(), cl_struc2cpos(), cl_struc2str(), cl_struc_values(), compress_reversed_index(), decode_print_token_sequence(), decompress_check_reversed_index(), get_corpus_positions(), get_nr_of_strucs(), lexdecode_print_item_info(), lexdecode_show(), send_cl_error(), and Setop().