utf8proc
C library for processing UTF-8 Unicode data
Data Structures | Typedefs | Enumerations | Functions | Variables
utf8proc.h File Reference
#include <stdlib.h>
#include <sys/types.h>
#include <stdbool.h>
#include <inttypes.h>
#include <limits.h>

Go to the source code of this file.

Data Structures

struct  utf8proc_property_struct
 

Macros

Error codes

Error codes being returned by almost all functions.

#define UTF8PROC_ERROR_NOMEM   -1
 
#define UTF8PROC_ERROR_OVERFLOW   -2
 
#define UTF8PROC_ERROR_INVALIDUTF8   -3
 
#define UTF8PROC_ERROR_NOTASSIGNED   -4
 
#define UTF8PROC_ERROR_INVALIDOPTS   -5
 

Typedefs

typedef int16_t utf8proc_propval_t
 
typedef struct utf8proc_property_struct utf8proc_property_t
 

Enumerations

enum  utf8proc_option_t {
  UTF8PROC_NULLTERM = (1<<0),
  UTF8PROC_STABLE = (1<<1),
  UTF8PROC_COMPAT = (1<<2),
  UTF8PROC_COMPOSE = (1<<3),
  UTF8PROC_DECOMPOSE = (1<<4),
  UTF8PROC_IGNORE = (1<<5),
  UTF8PROC_REJECTNA = (1<<6),
  UTF8PROC_NLF2LS = (1<<7),
  UTF8PROC_NLF2PS = (1<<8),
  UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS),
  UTF8PROC_STRIPCC = (1<<9),
  UTF8PROC_CASEFOLD = (1<<10),
  UTF8PROC_CHARBOUND = (1<<11),
  UTF8PROC_LUMP = (1<<12),
  UTF8PROC_STRIPMARK = (1<<13)
}
 
enum  utf8proc_category_t {
  UTF8PROC_CATEGORY_CN = 0,
  UTF8PROC_CATEGORY_LU = 1,
  UTF8PROC_CATEGORY_LL = 2,
  UTF8PROC_CATEGORY_LT = 3,
  UTF8PROC_CATEGORY_LM = 4,
  UTF8PROC_CATEGORY_LO = 5,
  UTF8PROC_CATEGORY_MN = 6,
  UTF8PROC_CATEGORY_MC = 7,
  UTF8PROC_CATEGORY_ME = 8,
  UTF8PROC_CATEGORY_ND = 9,
  UTF8PROC_CATEGORY_NL = 10,
  UTF8PROC_CATEGORY_NO = 11,
  UTF8PROC_CATEGORY_PC = 12,
  UTF8PROC_CATEGORY_PD = 13,
  UTF8PROC_CATEGORY_PS = 14,
  UTF8PROC_CATEGORY_PE = 15,
  UTF8PROC_CATEGORY_PI = 16,
  UTF8PROC_CATEGORY_PF = 17,
  UTF8PROC_CATEGORY_PO = 18,
  UTF8PROC_CATEGORY_SM = 19,
  UTF8PROC_CATEGORY_SC = 20,
  UTF8PROC_CATEGORY_SK = 21,
  UTF8PROC_CATEGORY_SO = 22,
  UTF8PROC_CATEGORY_ZS = 23,
  UTF8PROC_CATEGORY_ZL = 24,
  UTF8PROC_CATEGORY_ZP = 25,
  UTF8PROC_CATEGORY_CC = 26,
  UTF8PROC_CATEGORY_CF = 27,
  UTF8PROC_CATEGORY_CS = 28,
  UTF8PROC_CATEGORY_CO = 29
}
 
enum  utf8proc_bidi_class_t {
  UTF8PROC_BIDI_CLASS_L = 1,
  UTF8PROC_BIDI_CLASS_LRE = 2,
  UTF8PROC_BIDI_CLASS_LRO = 3,
  UTF8PROC_BIDI_CLASS_R = 4,
  UTF8PROC_BIDI_CLASS_AL = 5,
  UTF8PROC_BIDI_CLASS_RLE = 6,
  UTF8PROC_BIDI_CLASS_RLO = 7,
  UTF8PROC_BIDI_CLASS_PDF = 8,
  UTF8PROC_BIDI_CLASS_EN = 9,
  UTF8PROC_BIDI_CLASS_ES = 10,
  UTF8PROC_BIDI_CLASS_ET = 11,
  UTF8PROC_BIDI_CLASS_AN = 12,
  UTF8PROC_BIDI_CLASS_CS = 13,
  UTF8PROC_BIDI_CLASS_NSM = 14,
  UTF8PROC_BIDI_CLASS_BN = 15,
  UTF8PROC_BIDI_CLASS_B = 16,
  UTF8PROC_BIDI_CLASS_S = 17,
  UTF8PROC_BIDI_CLASS_WS = 18,
  UTF8PROC_BIDI_CLASS_ON = 19,
  UTF8PROC_BIDI_CLASS_LRI = 20,
  UTF8PROC_BIDI_CLASS_RLI = 21,
  UTF8PROC_BIDI_CLASS_FSI = 22,
  UTF8PROC_BIDI_CLASS_PDI = 23
}
 
enum  utf8proc_decomp_type_t {
  UTF8PROC_DECOMP_TYPE_FONT = 1,
  UTF8PROC_DECOMP_TYPE_NOBREAK = 2,
  UTF8PROC_DECOMP_TYPE_INITIAL = 3,
  UTF8PROC_DECOMP_TYPE_MEDIAL = 4,
  UTF8PROC_DECOMP_TYPE_FINAL = 5,
  UTF8PROC_DECOMP_TYPE_ISOLATED = 6,
  UTF8PROC_DECOMP_TYPE_CIRCLE = 7,
  UTF8PROC_DECOMP_TYPE_SUPER = 8,
  UTF8PROC_DECOMP_TYPE_SUB = 9,
  UTF8PROC_DECOMP_TYPE_VERTICAL = 10,
  UTF8PROC_DECOMP_TYPE_WIDE = 11,
  UTF8PROC_DECOMP_TYPE_NARROW = 12,
  UTF8PROC_DECOMP_TYPE_SMALL = 13,
  UTF8PROC_DECOMP_TYPE_SQUARE = 14,
  UTF8PROC_DECOMP_TYPE_FRACTION = 15,
  UTF8PROC_DECOMP_TYPE_COMPAT = 16
}
 
enum  utf8proc_boundclass_t {
  UTF8PROC_BOUNDCLASS_START = 0,
  UTF8PROC_BOUNDCLASS_OTHER = 1,
  UTF8PROC_BOUNDCLASS_CR = 2,
  UTF8PROC_BOUNDCLASS_LF = 3,
  UTF8PROC_BOUNDCLASS_CONTROL = 4,
  UTF8PROC_BOUNDCLASS_EXTEND = 5,
  UTF8PROC_BOUNDCLASS_L = 6,
  UTF8PROC_BOUNDCLASS_V = 7,
  UTF8PROC_BOUNDCLASS_T = 8,
  UTF8PROC_BOUNDCLASS_LV = 9,
  UTF8PROC_BOUNDCLASS_LVT = 10,
  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11,
  UTF8PROC_BOUNDCLASS_SPACINGMARK = 12
}
 

Functions

const char * utf8proc_version (void)
 
const char * utf8proc_errmsg (ssize_t errcode)
 
ssize_t utf8proc_iterate (const uint8_t *str, ssize_t strlen, int32_t *codepoint_ref)
 
bool utf8proc_codepoint_valid (int32_t codepoint)
 
ssize_t utf8proc_encode_char (int32_t codepoint, uint8_t *dst)
 
const utf8proc_property_t * utf8proc_get_property (int32_t codepoint)
 
ssize_t utf8proc_decompose_char (int32_t codepoint, int32_t *dst, ssize_t bufsize, utf8proc_option_t options, int *last_boundclass)
 
ssize_t utf8proc_decompose (const uint8_t *str, ssize_t strlen, int32_t *buffer, ssize_t bufsize, utf8proc_option_t options)
 
ssize_t utf8proc_reencode (int32_t *buffer, ssize_t length, utf8proc_option_t options)
 
bool utf8proc_grapheme_break (int32_t codepoint1, int32_t codepoint2)
 
int utf8proc_charwidth (int32_t codepoint)
 
utf8proc_category_t utf8proc_category (int32_t codepoint)
 
const char * utf8proc_category_string (int32_t codepoint)
 
ssize_t utf8proc_map (const uint8_t *str, ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options)
 
Normalized versions.

Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC normalized version of the null-terminated string 'str'.

uint8_t * utf8proc_NFD (const uint8_t *str)
 
uint8_t * utf8proc_NFC (const uint8_t *str)
 
uint8_t * utf8proc_NFKD (const uint8_t *str)
 
uint8_t * utf8proc_NFKC (const uint8_t *str)
 

Variables

const int8_t utf8proc_utf8class [256]
 

Macro Definition Documentation

#define UTF8PROC_ERROR_INVALIDOPTS   -5

Invalid options have been used.

#define UTF8PROC_ERROR_INVALIDUTF8   -3

The given string is not a legal UTF-8 string.

#define UTF8PROC_ERROR_NOMEM   -1

Memory could not be allocated.

#define UTF8PROC_ERROR_NOTASSIGNED   -4

The UTF8PROC_REJECTNA flag was set and an unassigned code point was found.

#define UTF8PROC_ERROR_OVERFLOW   -2

The given string is too long to be processed.

Typedef Documentation

typedef struct utf8proc_property_struct utf8proc_property_t

Struct containing information about a codepoint.

typedef int16_t utf8proc_propval_t

Holds the value of a property.

Enumeration Type Documentation

enum utf8proc_bidi_class_t

Bidirectional character classes.

Enumerator
UTF8PROC_BIDI_CLASS_L 

Left-to-Right

UTF8PROC_BIDI_CLASS_LRE 

Left-to-Right Embedding

UTF8PROC_BIDI_CLASS_LRO 

Left-to-Right Override

UTF8PROC_BIDI_CLASS_R 

Right-to-Left

UTF8PROC_BIDI_CLASS_AL 

Right-to-Left Arabic

UTF8PROC_BIDI_CLASS_RLE 

Right-to-Left Embedding

UTF8PROC_BIDI_CLASS_RLO 

Right-to-Left Override

UTF8PROC_BIDI_CLASS_PDF 

Pop Directional Format

UTF8PROC_BIDI_CLASS_EN 

European Number

UTF8PROC_BIDI_CLASS_ES 

European Separator

UTF8PROC_BIDI_CLASS_ET 

European Number Terminator

UTF8PROC_BIDI_CLASS_AN 

Arabic Number

UTF8PROC_BIDI_CLASS_CS 

Common Number Separator

UTF8PROC_BIDI_CLASS_NSM 

Nonspacing Mark

UTF8PROC_BIDI_CLASS_BN 

Boundary Neutral

UTF8PROC_BIDI_CLASS_B 

Paragraph Separator

UTF8PROC_BIDI_CLASS_S 

Segment Separator

UTF8PROC_BIDI_CLASS_WS 

Whitespace

UTF8PROC_BIDI_CLASS_ON 

Other Neutrals

UTF8PROC_BIDI_CLASS_LRI 

Left-to-Right Isolate

UTF8PROC_BIDI_CLASS_RLI 

Right-to-Left Isolate

UTF8PROC_BIDI_CLASS_FSI 

First Strong Isolate

UTF8PROC_BIDI_CLASS_PDI 

Pop Directional Isolate

enum utf8proc_boundclass_t

Boundclass property.

Enumerator
UTF8PROC_BOUNDCLASS_START 

Start

UTF8PROC_BOUNDCLASS_OTHER 

Other

UTF8PROC_BOUNDCLASS_CR 

Cr

UTF8PROC_BOUNDCLASS_LF 

Lf

UTF8PROC_BOUNDCLASS_CONTROL 

Control

UTF8PROC_BOUNDCLASS_EXTEND 

Extend

UTF8PROC_BOUNDCLASS_L 

L

UTF8PROC_BOUNDCLASS_V 

V

UTF8PROC_BOUNDCLASS_T 

T

UTF8PROC_BOUNDCLASS_LV 

Lv

UTF8PROC_BOUNDCLASS_LVT 

Lvt

UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 

Regional indicator

UTF8PROC_BOUNDCLASS_SPACINGMARK 

Spacingmark

enum utf8proc_category_t

Unicode categories.

Enumerator
UTF8PROC_CATEGORY_CN 

Other, not assigned

UTF8PROC_CATEGORY_LU 

Letter, uppercase

UTF8PROC_CATEGORY_LL 

Letter, lowercase

UTF8PROC_CATEGORY_LT 

Letter, titlecase

UTF8PROC_CATEGORY_LM 

Letter, modifier

UTF8PROC_CATEGORY_LO 

Letter, other

UTF8PROC_CATEGORY_MN 

Mark, nonspacing

UTF8PROC_CATEGORY_MC 

Mark, spacing combining

UTF8PROC_CATEGORY_ME 

Mark, enclosing

UTF8PROC_CATEGORY_ND 

Number, decimal digit

UTF8PROC_CATEGORY_NL 

Number, letter

UTF8PROC_CATEGORY_NO 

Number, other

UTF8PROC_CATEGORY_PC 

Punctuation, connector

UTF8PROC_CATEGORY_PD 

Punctuation, dash

UTF8PROC_CATEGORY_PS 

Punctuation, open

UTF8PROC_CATEGORY_PE 

Punctuation, close

UTF8PROC_CATEGORY_PI 

Punctuation, initial quote

UTF8PROC_CATEGORY_PF 

Punctuation, final quote

UTF8PROC_CATEGORY_PO 

Punctuation, other

UTF8PROC_CATEGORY_SM 

Symbol, math

UTF8PROC_CATEGORY_SC 

Symbol, currency

UTF8PROC_CATEGORY_SK 

Symbol, modifier

UTF8PROC_CATEGORY_SO 

Symbol, other

UTF8PROC_CATEGORY_ZS 

Separator, space

UTF8PROC_CATEGORY_ZL 

Separator, line

UTF8PROC_CATEGORY_ZP 

Separator, paragraph

UTF8PROC_CATEGORY_CC 

Other, control

UTF8PROC_CATEGORY_CF 

Other, format

UTF8PROC_CATEGORY_CS 

Other, surrogate

UTF8PROC_CATEGORY_CO 

Other, private use

enum utf8proc_decomp_type_t

Decomposition type.

Enumerator
UTF8PROC_DECOMP_TYPE_FONT 

Font

UTF8PROC_DECOMP_TYPE_NOBREAK 

Nobreak

UTF8PROC_DECOMP_TYPE_INITIAL 

Initial

UTF8PROC_DECOMP_TYPE_MEDIAL 

Medial

UTF8PROC_DECOMP_TYPE_FINAL 

Final

UTF8PROC_DECOMP_TYPE_ISOLATED 

Isolated

UTF8PROC_DECOMP_TYPE_CIRCLE 

Circle

UTF8PROC_DECOMP_TYPE_SUPER 

Super

UTF8PROC_DECOMP_TYPE_SUB 

Sub

UTF8PROC_DECOMP_TYPE_VERTICAL 

Vertical

UTF8PROC_DECOMP_TYPE_WIDE 

Wide

UTF8PROC_DECOMP_TYPE_NARROW 

Narrow

UTF8PROC_DECOMP_TYPE_SMALL 

Small

UTF8PROC_DECOMP_TYPE_SQUARE 

Square

UTF8PROC_DECOMP_TYPE_FRACTION 

Fraction

UTF8PROC_DECOMP_TYPE_COMPAT 

Compat

enum utf8proc_option_t

Option flags used by several functions in the library.

Enumerator
UTF8PROC_NULLTERM 

The given UTF-8 input is NULL terminated.

UTF8PROC_STABLE 

Unicode Versioning Stability has to be respected.

UTF8PROC_COMPAT 

Compatibility decomposition (i.e. formatting information is lost).

UTF8PROC_COMPOSE 

Return a result with composed characters.

UTF8PROC_DECOMPOSE 

Return a result with decomposed characters.

UTF8PROC_IGNORE 

Strip "default ignorable characters".

UTF8PROC_REJECTNA 

Return an error, if the input contains unassigned code points.

UTF8PROC_NLF2LS 

Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a line break, and should be converted to the codepoint for line separation (LS).

UTF8PROC_NLF2PS 

Indicating that NLF-sequences are representing a paragraph break, and should be converted to the codepoint for paragraph separation (PS).

UTF8PROC_NLF2LF 

Indicating that the meaning of NLF-sequences is unknown.

UTF8PROC_STRIPCC 

Strips and/or converts control characters.

NLF-sequences are transformed into space, except if one of the NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF) are treated as a NLF-sequence in this case. All other control characters are simply removed.

UTF8PROC_CASEFOLD 

Performs unicode case folding, to be able to do a case-insensitive string comparison.

UTF8PROC_CHARBOUND 

Inserts 0xFF bytes at the beginning of each sequence which is representing a single grapheme cluster (see UAX#29).

UTF8PROC_LUMP 

Lumps certain characters together.

E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details.

If NLF2LF is set, this includes a transformation of paragraph and line separators to ASCII line-feed (LF).

UTF8PROC_STRIPMARK 

Strips all character markings.

This includes non-spacing, spacing and enclosing marks (e.g. accents).

Note
This option works only with UTF8PROC_COMPOSE or UTF8PROC_DECOMPOSE

Function Documentation

utf8proc_category_t utf8proc_category ( int32_t  codepoint)

Return the Unicode category for the codepoint (one of the utf8proc_category_t constants).

const char* utf8proc_category_string ( int32_t  codepoint)

Return the two-letter (nul-terminated) Unicode category string for the codepoint (e.g. "Lu" or "Co").

int utf8proc_charwidth ( int32_t  codepoint)

Given a codepoint, return a character width analogous to wcwidth(codepoint), except that a width of 0 is returned for non-printable codepoints instead of -1 as in wcwidth.

Note
If you want to check for particular types of non-printable characters, (analogous to isprint or iscntrl), use utf8proc_category.
bool utf8proc_codepoint_valid ( int32_t  codepoint)

Check if a codepoint is valid.

Returns
1, if the given codepoint is valid, otherwise 0.
ssize_t utf8proc_decompose ( const uint8_t *  str,
ssize_t  strlen,
int32_t *  buffer,
ssize_t  bufsize,
utf8proc_option_t  options 
)

Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8 string, and orders the decomposed sequences correctly.

If the UTF8PROC_NULLTERM flag in 'options' is set, processing stops when a NULL byte is encountered; otherwise 'strlen' bytes are processed. The result, in the form of Unicode code points, is written into the buffer pointed to by 'buffer', which has a length of 'bufsize' entries. On success, the number of codepoints written is returned; on error, a negative error code is returned.

If the number of written codepoints would be bigger than 'bufsize', the buffer (up to 'bufsize') contains unpredictable data, and the required buffer size is returned.

ssize_t utf8proc_decompose_char ( int32_t  codepoint,
int32_t *  dst,
ssize_t  bufsize,
utf8proc_option_t  options,
int *  last_boundclass 
)

Decompose a codepoint into an array of codepoints.

Parameters
codepoint: the codepoint.
dst: the destination buffer.
bufsize: the size of the destination buffer.
options: one or more utf8proc_option_t flags.
last_boundclass: This pointer has to point to an integer variable which stores the last codepoint's boundary class, if the UTF8PROC_CHARBOUND option is used.
Returns
In case of success the number of codepoints written is returned, in case of an error, a negative error code is returned.
If the number of written codepoints would be bigger than 'bufsize', the buffer (up to 'bufsize') contains unpredictable data, and the required buffer size is returned.
ssize_t utf8proc_encode_char ( int32_t  codepoint,
uint8_t *  dst 
)

Encodes the codepoint as a UTF-8 string in the byte array pointed to by 'dst'. This array has to be at least 4 bytes long.

In case of success the number of bytes written is returned, otherwise 0.

This function does not check if the codepoint is a valid unicode code point.

const char* utf8proc_errmsg ( ssize_t  errcode)

Returns a static error string for the given error code.

const utf8proc_property_t* utf8proc_get_property ( int32_t  codepoint)

Lookup the properties for a given codepoint.

Parameters
codepoint: The codepoint.
Returns
A pointer to a (constant) struct containing information about the codepoint.
If the codepoint does not exist, a pointer to a special struct is returned, whose category is 0 (UTF8PROC_CATEGORY_CN).
bool utf8proc_grapheme_break ( int32_t  codepoint1,
int32_t  codepoint2 
)

Given a pair of consecutive codepoints, return whether a grapheme break is permitted between them (as defined by the extended grapheme clusters in UAX#29).

ssize_t utf8proc_iterate ( const uint8_t *  str,
ssize_t  strlen,
int32_t *  codepoint_ref 
)

Reads a single codepoint from the UTF-8 sequence being pointed to by 'str'. The maximum number of bytes read is 'strlen', unless 'strlen' is negative (in which case up to 4 bytes are read).

If a valid codepoint could be read, it is stored in the variable being pointed to by 'codepoint_ref', otherwise that variable will be set to -1. In case of success the number of bytes read is returned, otherwise a negative error code is returned.

ssize_t utf8proc_map ( const uint8_t *  str,
ssize_t  strlen,
uint8_t **  dstptr,
utf8proc_option_t  options 
)

Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8 string, which is allocated dynamically, and afterwards pointed to by the pointer being pointed to by 'dstptr'.

If the UTF8PROC_NULLTERM flag in the 'options' field is set, the length is determined by a NULL terminator, otherwise the parameter 'strlen' is evaluated to determine the string length, but in any case the result will be NULL terminated (though it might contain NULL characters before). Other flags in the 'options' field are passed to the functions defined above, and regarded as described.

In case of success the length of the new string is returned, otherwise a negative error code is returned.

NOTICE: The memory of the new UTF-8 string will have been allocated with 'malloc', and therefore has to be freed with 'free'.

ssize_t utf8proc_reencode ( int32_t *  buffer,
ssize_t  length,
utf8proc_option_t  options 
)

Reencodes the sequence of codepoints given by the pointer 'buffer' and 'length' as UTF-8.

The result is stored in the same memory area where the data is read.

Parameters
buffer: the (native-endian UTF-32) unicode codepoints to re-encode.
length: the length (in codepoints) of the buffer.
options: one or more utf8proc_option_t flags.
Returns
In case of success the length of the resulting UTF-8 string is returned, otherwise a negative error code is returned.
Warning
The amount of free space pointed to by 'buffer' has to exceed the amount of the input data by one byte, and the entries of the array pointed to by 'buffer' have to be in the range of 0x0000 to 0x10FFFF, otherwise the program might crash!
const char* utf8proc_version ( void  )

Returns the version as a string.

Variable Documentation

const int8_t utf8proc_utf8class[256]

Array containing the byte lengths of a UTF-8 encoded codepoint based on the first byte.