#include <ctype.h>
#include "globals.h"
#include "special-chars.h"
#define popc(s, p) s[p++]
Referenced by cl_string_latex2iso().
#define pushc(s, c, p, m) s[p++] = c; if (p>=m) goto endloop;
Referenced by cl_string_latex2iso().
void cl_string_canonical(char *s, int flags)
Converts a string to canonical form.
The "canonical form" of a string is for use in comparisons where case-insensitivity and/or diacritic insensitivity is desired.
Note that the string s is modified in place.
Parameters:
  s: The string (must be Latin-1!).
  flags: The flags that specify which conversions are required. Can be IGNORE_CASE and/or IGNORE_DIAC.
References cl_string_maptable(), and latin1.
Referenced by cl_new_regex(), cl_regex_match(), collect_matching_ids(), main(), print_tabulation(), setup_corpus(), and SortSubcorpus().
char *cl_string_latex2iso(char *str, char *result, int target_len)
Converts strings with LaTeX-style backslash escapes for accented characters to ISO-8859-1 (Latin-1).
Syntax:
\[AaOoUus..] --> corresponding ISO 8859-1 character
\{octal} --> ISO 8859-1 character
Parameters:
  str: The string to convert.
  result: The location to put the altered string (which should be shorter than, or at least no longer than, the input string). If this parameter is NULL, space is automatically allocated for the output. result is allowed to be the same as str.
  target_len: The maximum length of the target string. If result is NULL, then this is set automatically.
References cl_malloc(), popc, and pushc.
Referenced by cl_new_regex(), do_flagged_string(), do_SetVariableValue(), and do_XMLTag().
unsigned char *cl_string_maptable(CorpusCharset charset, int flags)
Gets a specified character mapping table for use in regular expressions.
Parameters:
  charset: The character set of this corpus. Currently ignored.
  flags: The flags that specify which table is required. Can be IGNORE_CASE and/or IGNORE_DIAC.
References IGNORE_CASE, IGNORE_DIAC, latin1_identity_tab, latin1_identity_tab_init, latin1_nocase_nodiac_tab, latin1_nocase_nodiac_tab_init, latin1_nocase_tab, and latin1_nodiac_tab.
Referenced by cl_string_canonical(), and SortSubcorpus().
unsigned char cp1251_nocase_tab[256]
Table which translates cp-1251 (ASCII + cyrillic) characters to lowercase.
Use cl_string_maptable to access.
unsigned char latin1_identity_tab[256]
Table with identity mapping of latin-1 characters (no flags).
Use cl_string_maptable to access.
Referenced by cl_string_maptable().
int latin1_identity_tab_init = 0
Referenced by cl_string_maptable().
unsigned char latin1_nocase_nodiac_tab[256]
Table with mapping for the cd flag for latin-1 (no case, no diacritics).
Use cl_string_maptable to access.
Referenced by cl_string_maptable().
int latin1_nocase_nodiac_tab_init = 0
Referenced by cl_string_maptable().
unsigned char latin1_nocase_tab[256]
Table which translates latin-1 characters to lowercase.
Use cl_string_maptable to access.
Referenced by cl_string_maptable().
unsigned char latin1_nodiac_tab[256]
Table which translates latin-1 characters with diacritics to their [A-Za-z] "equivalents", including ß->s, þ->t.
Use cl_string_maptable to access.
Referenced by cl_string_maptable().