######################################################################
#
# EPrints::Index::Tokenizer
#
######################################################################
#
#
######################################################################


=pod

=head1 NAME

B<EPrints::Index::Tokenizer> - text indexing utility methods

=head1 DESCRIPTION

This module provides utility methods for processing free text into indexable things.

=head1 METHODS

=over 4

=cut

package EPrints::Index::Tokenizer;

use Text::Unidecode qw(unidecode);

######################################################################
=pod

=item @words = EPrints::Index::Tokenizer::split_words( $session, $utext )

Splits a utf8 string into individual words.

=cut
######################################################################

sub split_words
{
	# Split $utext into a list of words and return that list.
	#
	#   $session - accepted for interface compatibility; not used here.
	#   $utext   - a character string, or a Unicode::String object.
	#
	# Returns the list of non-empty words found in $utext (an empty list
	# when there are none).
	my( $session, $utext ) = @_;

	# Treat a missing value as empty text so the steps below never see undef.
	$utext = "" unless defined $utext;

	# Encode is not loaded at the top of this file; load it here rather than
	# relying on some other module having pulled it in already.
	require Encode;

	if( ref($utext) eq "Unicode::String" )
	{
		# Stringify the object; the result is handed to the decoder below.
		$utext = "$utext";
	}
	else
	{
		# Turn the character string into UTF-8 octets for the decoder below.
		utf8::encode($utext);
	}

	# Fix malformed UTF-8 data. The constant is called with parentheses so
	# it is resolved at run time, after the require above.
	$utext = Encode::decode("UTF-8", $utext, Encode::FB_DEFAULT());

	# A word is a run of word characters and apostrophes. split() yields an
	# empty leading field when the text starts with a separator; an empty
	# string is not a word, so it is dropped.
	return grep { length } split /[^\w']+/, $utext;
}
=item @terms = EPrints::Index::Tokenizer::split_search_value( $session, $value )

Splits $value into search terms and returns them.

=cut

sub split_search_value
{
	# Split a search value into a list of search terms.
	#
	#   $session - passed through to apply_mapping().
	#   $value   - the text to split.
	#
	# Returns the list of non-empty terms found in $value.
	my( $session, $value ) = @_;

	# Transliterate first, so the terms go through the same apply_mapping()
	# normalisation that is used before indexing words.
	$value = apply_mapping( $session, $value );

	# A term is a run of word characters, apostrophes and '*'
	# (presumably '*' is kept as a wildcard marker - confirm with callers).
	# split() yields an empty leading field when the value starts with a
	# separator; an empty string is not a search term, so it is dropped.
	return grep { length } split /[^\w'\*]+/, $value;
}

######################################################################
=pod

=item $utext2 = EPrints::Index::Tokenizer::apply_mapping( $session, $utext )

Replaces certain unicode characters with ASCII equivalents and returns
the new string.

This is used before indexing words so that things like umlauts will
be ignored when searching.

=cut
######################################################################

sub apply_mapping
{
	# Transliterate $utext and return the resulting string.
	#
	#   $session - accepted for interface compatibility; not used here.
	#   $utext   - a character string, or a Unicode::String object.
	my( $session, $utext ) = @_;

	# A Unicode::String object is stringified and the result decoded in
	# place as UTF-8 before it is transliterated.
	if( ref($utext) eq "Unicode::String" )
	{
		$utext = "$utext";
		utf8::decode($utext);
	}

	my $transliterated = unidecode( $utext );

	# Look each remaining character up in the free-text mapping table.
	# This should have little to do if unidecode has done its job.
	my $mapped = "";
	foreach my $char ( split(//, $transliterated) )
	{
		if( exists($EPrints::Index::FREETEXT_CHAR_MAPPING->{$char}) )
		{
			$mapped .= $EPrints::Index::FREETEXT_CHAR_MAPPING->{$char};
		}
		else
		{
			$mapped .= $char;
		}
	}

	return $mapped;
}

##############################################################################
# Mappings and character tables
##############################################################################

# This map is used to convert Unicode characters
# to ASCII characters below 127, in the word index.
# This means that the word Fête is indexed as 'fete' and
# "fete" or "fête" will match it.
# There's no reason mappings have to be a single character.

$EPrints::Index::FREETEXT_CHAR_MAPPING = {
		chr(0x0027) => "'",	  # '
		chr(0x00a1) => '!',	 # ¡
		chr(0x00a2) => 'c',	 # ¢
		chr(0x00a3) => 'L',	 # £
		chr(0x00a4) => 'o',	 # ¤
		chr(0x00a5) => 'Y',	 # ¥
		chr(0x00a6) => '|',	 # ¦
		chr(0x00a7) => 'S',	 # §
		chr(0x00a8) => '"',	 # ¨
		chr(0x00a9) => '(c)',   # ©
		chr(0x00aa) => 'a',	 # ª
		chr(0x00ab) => '<<',	# «
		chr(0x00ac) => '-',	 # ¬
		chr(0x00ad) => '-',	 # ­
		chr(0x00ae) => '(R)',   # ®
		chr(0x00af) => '-',	 # ¯
		chr(0x00b0) => 'o',	 # °
		chr(0x00b1) => '+-',	# ±
		chr(0x00b2) => '2',	 # ²
		chr(0x00b3) => '3',	 # ³
		chr(0x00b5) => 'u',	 # µ
		chr(0x00b6) => 'q',	 # ¶
		chr(0x00b7) => '.',	 # ·
		chr(0x00b8) => ',',	 # ¸
		chr(0x00b9) => '1',	 # ¹
		chr(0x00ba) => 'o',	 # º
		chr(0x00bb) => '>>',	# »
		chr(0x00bc) => '1/4',   # ¼
		chr(0x00bd) => '1/2',   # ½
		chr(0x00be) => '3/4',   # ¾
		chr(0x00bf) => '?',	 # ¿
		chr(0x00c0) => 'A',	 # À
		chr(0x00c1) => 'A',	 # Á
		chr(0x00c2) => 'A',	 # Â
		chr(0x00c3) => 'A',	 # Ã
		chr(0x00c4) => 'AE',	# Ä
		chr(0x00c6) => 'AE',	# Æ
		chr(0x00c7) => 'C',	 # Ç
		chr(0x00c8) => 'E',	 # È
		chr(0x00c9) => 'E',	 # É
		chr(0x00ca) => 'E',	 # Ê
		chr(0x00cb) => 'E',	 # Ë
		chr(0x00cc) => 'I',	 # Ì
		chr(0x00cd) => 'I',	 # Í
		chr(0x00ce) => 'I',	 # Î
		chr(0x00cf) => 'I',	 # Ï
		chr(0x00d0) => 'D',	 # Ð
		chr(0x00d1) => 'N',	 # Ñ
		chr(0x00d2) => 'O',	 # Ò
		chr(0x00d3) => 'O',	 # Ó
		chr(0x00d4) => 'O',	 # Ô
		chr(0x00d5) => 'O',	 # Õ
		chr(0x00d6) => 'OE',	# Ö
		chr(0x00d7) => 'x',	 # ×
		chr(0x00d8) => 'O',	 # Ø
		chr(0x00d9) => 'U',	 # Ù
		chr(0x00da) => 'U',	 # Ú
		chr(0x00db) => 'U',	 # Û
		chr(0x00dc) => 'UE',	# Ü
		chr(0x00dd) => 'Y',	 # Ý
		chr(0x00de) => 'TH',	# Þ
		chr(0x00df) => 'ss',	# ß
		chr(0x00e0) => 'a',	 # à
		chr(0x00e1) => 'a',	 # á
		chr(0x00e2) => 'a',	 # â
		chr(0x00e3) => 'a',	 # ã
		chr(0x00e4) => 'ae',	# ä
		chr(0x00e5) => 'a',	 # å
		chr(0x00e6) => 'ae',	# æ
		chr(0x00e7) => 'c',	 # ç
		chr(0x00e8) => 'e',	 # è
		chr(0x00e9) => 'e',	 # é
		chr(0x00ea) => 'e',	 # ê
		chr(0x00eb) => 'e',	 # ë
		chr(0x00ec) => 'i',	 # ì
		chr(0x00ed) => 'i',	 # í
		chr(0x00ee) => 'i',	 # î
		chr(0x00ef) => 'i',	 # ï
		chr(0x00f0) => 'd',	 # ð
		chr(0x00f1) => 'n',	 # ñ
		chr(0x00f2) => 'o',	 # ò
		chr(0x00f3) => 'o',	 # ó
		chr(0x00f4) => 'o',	 # ô
		chr(0x00f5) => 'o',	 # õ
		chr(0x00f6) => 'oe',	 # ö
		chr(0x00f7) => '/',	 # ÷
		chr(0x00f8) => 'oe',	 # ø
		chr(0x00f9) => 'u',	 # ù
		chr(0x00fa) => 'u',	 # ú
		chr(0x00fb) => 'u',	 # û
		chr(0x00fc) => 'ue',	 # ü
		chr(0x00fd) => 'y',	 # ý
		chr(0x00fe) => 'th',	# þ
		chr(0x00ff) => 'y',	 # ÿ
		chr(0x00c4) => 'AE',	# Ä
		chr(0x00C5) => 'A',	# Å
		chr(0x00c6) => 'AE',	# Æ
		chr(0x00c7) => 'C',	# Ç
		chr(0x00c8) => 'E',	# È
		chr(0x00c9) => 'E',	# É
		chr(0x00ca) => 'E',	# Ê
		chr(0x00cb) => 'E',	# Ë
		chr(0x00cc) => 'I',	# Ì
		chr(0x00cd) => 'I',	# Í
		chr(0x00ce) => 'I',	# Î
		chr(0x00cf) => 'I',	# Ï
		chr(0x00d0) => 'D',	# Ð
		chr(0x00d1) => 'N',	# Ñ
		chr(0x00d2) => 'O',	# Ò
		chr(0x00d3) => 'O',	# Ó
		chr(0x00d4) => 'O',	# Ô
		chr(0x00d5) => 'O',	# Õ
		chr(0x00d6) => 'OE',	# Ö
		chr(0x00d7) => 'x',	# ×
		chr(0x00d8) => 'O',	# Ø
		chr(0x00d9) => 'U',	# Ù
		chr(0x00da) => 'U',	# Ú
		chr(0x00db) => 'U',	# Û
		chr(0x00dc) => 'UE',	# Ü
		chr(0x00dd) => 'Y',	# Ý
		chr(0x00de) => 'TH',	# Þ
		chr(0x00df) => 'ss',	# ß
		chr(0x00e0) => 'a',	# à
		chr(0x00e1) => 'a',	# á
		chr(0x00e2) => 'a',	# â
		chr(0x00e3) => 'a',	# ã
		chr(0x00e4) => 'ae',	# ä
		chr(0x00e5) => 'a',	# å
		chr(0x00e6) => 'ae',	# æ
		chr(0x00e7) => 'c',	# ç
		chr(0x00e8) => 'e',	# è
		chr(0x00e9) => 'e',	# é
		chr(0x00ea) => 'e',	# ê
		chr(0x00eb) => 'e',	# ë
		chr(0x00ec) => 'i',	# ì
		chr(0x00ed) => 'i',	# í
		chr(0x00ee) => 'i',	# î
		chr(0x00ef) => 'i',	# ï
		chr(0x00f0) => 'd',	# ð
		chr(0x00f1) => 'n',	# ñ
		chr(0x00f2) => 'o',	# ò
		chr(0x00f3) => 'o',	# ó
		chr(0x00f4) => 'o',	# ô
		chr(0x00f5) => 'o',	# õ
		chr(0x00f6) => 'oe',	# ö
		chr(0x00f7) => '/',	# ÷
		chr(0x00f8) => 'oe',	# ø
		chr(0x00f9) => 'u',	# ù
		chr(0x00fa) => 'u',	# ú
		chr(0x00fb) => 'u',	# û
		chr(0x00fc) => 'ue',	# ü
		chr(0x00fd) => 'y',	# ý
		chr(0x00fe) => 'th',	# þ
		chr(0x00ff) => 'y',	# ÿ
		chr(0x0100) => 'A',
		chr(0x0101) => 'a',
		chr(0x0102) => 'a',
		chr(0x0103) => 'a',
		chr(0x0104) => 'A',
		chr(0x0105) => 'a',
		chr(0x0106) => 'C',
		chr(0x0107) => 'c',
		chr(0x0108) => 'C',
		chr(0x0109) => 'c',
		chr(0x010A) => 'C',
		chr(0x010B) => 'c',
		chr(0x010C) => 'C',
		chr(0x010D) => 'c',
		chr(0x010E) => 'D',
		chr(0x010F) => 'd',
		chr(0x0110) => 'D',
		chr(0x0111) => 'd',
		chr(0x0112) => 'E',
		chr(0x0113) => 'e',
		chr(0x0114) => 'E',
		chr(0x0115) => 'e',
		chr(0x0116) => 'E',
		chr(0x0117) => 'e',
		chr(0x0118) => 'E',
		chr(0x0119) => 'e',
		chr(0x011A) => 'E',
		chr(0x011B) => 'e',
		chr(0x011C) => 'G',
		chr(0x011D) => 'g',
		chr(0x011E) => 'G',
		chr(0x011F) => 'g',
		chr(0x0120) => 'G',
		chr(0x0121) => 'g',
		chr(0x0122) => 'G',
		chr(0x0123) => 'g',
		chr(0x0124) => 'H',
		chr(0x0125) => 'h',
		chr(0x0126) => 'H',
		chr(0x0127) => 'h',
		chr(0x0128) => 'I',
		chr(0x0129) => 'i',
		chr(0x012A) => 'I',
		chr(0x012B) => 'i',
		chr(0x012C) => 'I',
		chr(0x012D) => 'i',
		chr(0x012E) => 'I',
		chr(0x012F) => 'i',
		chr(0x0130) => 'I',
		chr(0x0131) => 'i',
		chr(0x0132) => 'IJ',
		chr(0x0133) => 'ij',
		chr(0x0134) => 'J',
		chr(0x0135) => 'j',
		chr(0x0136) => 'K',
		chr(0x0137) => 'k',
		chr(0x0138) => 'k',
		chr(0x0139) => 'L',
		chr(0x013A) => 'l',
		chr(0x013B) => 'L',
		chr(0x013C) => 'l',
		chr(0x013D) => 'L',
		chr(0x013E) => 'l',
		chr(0x013F) => 'L',
		chr(0x0140) => 'l',
		chr(0x0141) => 'L',
		chr(0x0142) => 'l',
		chr(0x0143) => 'N',
		chr(0x0144) => 'n',
		chr(0x0145) => 'N',
		chr(0x0146) => 'n',
		chr(0x0147) => 'N',
		chr(0x0148) => 'n',
		chr(0x0149) => 'n',
		chr(0x014A) => 'N',
		chr(0x014B) => 'n',
		chr(0x014C) => 'O',
		chr(0x014D) => 'o',
		chr(0x014E) => 'O',
		chr(0x014F) => 'o',
		chr(0x0150) => 'OE',	 # Ö
		chr(0x0151) => 'oe',	 # ö
		chr(0x0152) => 'OE',
		chr(0x0153) => 'oe',
		chr(0x0154) => 'R',
		chr(0x0155) => 'r',
		chr(0x0156) => 'R',
		chr(0x0157) => 'r',
		chr(0x0158) => 'R',
		chr(0x0159) => 'r',
		chr(0x015A) => 'S',
		chr(0x015B) => 's',
		chr(0x015C) => 'S',
		chr(0x015D) => 's',
		chr(0x015E) => 'S',
		chr(0x015F) => 's',
		chr(0x0160) => 'S',
		chr(0x0161) => 's',
		chr(0x0162) => 'T',
		chr(0x0163) => 't',
		chr(0x0164) => 'T',
		chr(0x0165) => 't',
		chr(0x0166) => 'T',
		chr(0x0167) => 't',
		chr(0x0168) => 'U',
		chr(0x0169) => 'u',
		chr(0x016A) => 'U',
		chr(0x016B) => 'u',
		chr(0x016C) => 'U',
		chr(0x016D) => 'u',
		chr(0x016E) => 'U',
		chr(0x016F) => 'u',
		chr(0x0170) => 'UE',	 # Ü
		chr(0x0171) => 'ue',	 # ü
		chr(0x0172) => 'U',
		chr(0x0173) => 'u',
		chr(0x0174) => 'W',
		chr(0x0175) => 'w',
		chr(0x0176) => 'Y',
		chr(0x0177) => 'y',
		chr(0x0178) => 'Y',
		chr(0x0179) => 'Z',
		chr(0x017A) => 'z',
		chr(0x017B) => 'Z',
		chr(0x017C) => 'z',
		chr(0x017D) => 'Z',
		chr(0x017E) => 'z',
		chr(0x0300) => '', # combining diacritical marks start
		chr(0x0301) => '',
		chr(0x0302) => '',
		chr(0x0303) => '',
		chr(0x0304) => '',
		chr(0x0305) => '',
		chr(0x0306) => '',
		chr(0x0307) => '',
		chr(0x0308) => '',
		chr(0x0309) => '',
		chr(0x030A) => '',
		chr(0x030B) => '',
		chr(0x030C) => '',
		chr(0x030D) => '',
		chr(0x030E) => '',
		chr(0x030F) => '',
		chr(0x0310) => '',
		chr(0x0311) => '',
		chr(0x0312) => '',
		chr(0x0313) => '',
		chr(0x0314) => '',
		chr(0x0315) => '',
		chr(0x0316) => '',
		chr(0x0317) => '',
		chr(0x0318) => '',
		chr(0x0319) => '',
		chr(0x031A) => '',
		chr(0x031B) => '',
		chr(0x031C) => '',
		chr(0x031D) => '',
		chr(0x031E) => '',
		chr(0x031F) => '',
		chr(0x0320) => '',
		chr(0x0321) => '',
		chr(0x0322) => '',
		chr(0x0323) => '',
		chr(0x0324) => '',
		chr(0x0325) => '',
		chr(0x0326) => '',
		chr(0x0327) => '',
		chr(0x0328) => '',
		chr(0x0329) => '',
		chr(0x032A) => '',
		chr(0x032B) => '',
		chr(0x032C) => '',
		chr(0x032D) => '',
		chr(0x032E) => '',
		chr(0x032F) => '',
		chr(0x0330) => '',
		chr(0x0331) => '',
		chr(0x0332) => '',
		chr(0x0333) => '',
		chr(0x0334) => '',
		chr(0x0335) => '',
		chr(0x0336) => '',
		chr(0x0337) => '',
		chr(0x0338) => '',
		chr(0x0339) => '',
		chr(0x033A) => '',
		chr(0x033B) => '',
		chr(0x033C) => '',
		chr(0x033D) => '',
		chr(0x033E) => '',
		chr(0x033F) => '',
		chr(0x0340) => '',
		chr(0x0341) => '',
		chr(0x0342) => '',
		chr(0x0343) => '',
		chr(0x0344) => '',
		chr(0x0345) => '',
		chr(0x0346) => '',
		chr(0x0347) => '',
		chr(0x0348) => '',
		chr(0x0349) => '',
		chr(0x034A) => '',
		chr(0x034B) => '',
		chr(0x034C) => '',
		chr(0x034D) => '',
		chr(0x034E) => '',
		chr(0x034F) => '',
		chr(0x0350) => '',
		chr(0x0351) => '',
		chr(0x0352) => '',
		chr(0x0353) => '',
		chr(0x0354) => '',
		chr(0x0355) => '',
		chr(0x0356) => '',
		chr(0x0357) => '',
		chr(0x0358) => '',
		chr(0x0359) => '',
		chr(0x035A) => '',
		chr(0x035B) => '',
		chr(0x035C) => '',
		chr(0x035D) => '',
		chr(0x035E) => '',
		chr(0x035F) => '',
		chr(0x0360) => '',
		chr(0x0361) => '',
		chr(0x0362) => '', # combining diacritical marks end
		chr(0x0391) => 'A',
		chr(0x03B1) => 'a',
		chr(0x0392) => 'B',
		chr(0x03B2) => 'b',
		chr(0x0393) => 'G',
		chr(0x03B3) => 'g',
		chr(0x0394) => 'D',
		chr(0x03B4) => 'd',
		chr(0x0395) => 'E',
		chr(0x03B5) => 'e',
		chr(0x0396) => 'Z',
		chr(0x03B6) => 'z',
		chr(0x0397) => 'E',
		chr(0x03B7) => 'e',
		chr(0x0398) => 'TH',
		chr(0x03B8) => 'th',
		chr(0x0399) => 'I',
		chr(0x03B9) => 'i',
		chr(0x039A) => 'K',
		chr(0x03BA) => 'k',
		chr(0x039B) => 'L',
		chr(0x03BB) => 'l',
		chr(0x039C) => 'M',
		chr(0x03BC) => 'm',
		chr(0x039D) => 'N',
		chr(0x03BD) => 'n',
		chr(0x039E) => 'X',
		chr(0x03BE) => 'x',
		chr(0x039F) => 'O',
		chr(0x03BF) => 'o',
		chr(0x03A0) => 'P',
		chr(0x03C0) => 'p',
		chr(0x03A1) => 'R',
		chr(0x03C1) => 'r',
		chr(0x03A3) => 'S',
		chr(0x03C3) => 's',
		chr(0x03A4) => 'T',
		chr(0x03C4) => 't',
		chr(0x03A5) => 'Y',
		chr(0x03C5) => 'y',
		chr(0x03A6) => 'Ph',
		chr(0x03C6) => 'ph',
		chr(0x03A7) => 'Ch',
		chr(0x03C7) => 'ch',
		chr(0x03A8) => 'Ps',
		chr(0x03C8) => 'ps',
		chr(0x03A9) => 'O',
		chr(0x03C9) => 'o',
		chr(0x03AA) => 'I',
		chr(0x03CA) => 'i',
		chr(0x03AB) => 'Y',
		chr(0x03CB) => 'y',
		chr(0x03AC) => 'a',
		chr(0x03AD) => 'e',
		chr(0x03AE) => 'e',
		chr(0x03AF) => 'i',
		chr(0x03B0) => 'y',
		chr(0x03CC) => 'o',
		chr(0x03CD) => 'y',
		chr(0x03CE) => 'o',
		chr(0x0386) => 'A',
		chr(0x0389) => 'E',
		chr(0x038A) => 'I',
		chr(0x038C) => 'O',
		chr(0x038E) => 'Y',
		chr(0x038F) => 'O',
		chr(0x0390) => 'i',
		chr(0x0387) => ';',
		chr(0x0363) => 'a',
		chr(0x0364) => 'e',
		chr(0x0365) => 'i',
		chr(0x0366) => 'o',
		chr(0x0367) => 'u',
		chr(0x0368) => 'c',
		chr(0x0369) => 'd',
		chr(0x036A) => 'h',
		chr(0x036B) => 'm',
		chr(0x036C) => 'r',
		chr(0x036D) => 't',
		chr(0x036E) => 'v',
		chr(0x036F) => 'x',
		chr(0x2010) => '-',
		chr(0x2011) => '-',
		chr(0x2012) => '-',
		chr(0x2013) => '-',
		chr(0x2014) => '-',
		chr(0x2019) => "'",	 # ’
		chr(0x2074) => '4',
		chr(0x2075) => '5',
		chr(0x2076) => '6',
		chr(0x2077) => '7',
		chr(0x2078) => '8',
		chr(0x2079) => '9',
		chr(0x207A) => '+',
		chr(0x207B) => '-',
		chr(0x207C) => '=',
		chr(0x207D) => '(',
		chr(0x207E) => ')',
		chr(0x2080) => '0',
		chr(0x2081) => '1',
		chr(0x2082) => '2',
		chr(0x2083) => '3',
		chr(0x2084) => '4',
		chr(0x2085) => '5',
		chr(0x2086) => '6',
		chr(0x2087) => '7',
		chr(0x2088) => '8',
		chr(0x2089) => '9',
		chr(0x208A) => '+',
		chr(0x208B) => '-',
		chr(0x208C) => '=',
		chr(0x208D) => '(',
		chr(0x208E) => ')',
		chr(0x2090) => 'a',
		chr(0x2091) => 'e',
		chr(0x2092) => 'o',
		chr(0x2093) => 'x',
		chr(0x2094) => 'e',
		chr(0x2153) => '1/3',
		chr(0x2154) => '2/3',
		chr(0x2155) => '1/5',
		chr(0x2156) => '2/5',
		chr(0x2157) => '3/5',
		chr(0x2158) => '4/5',
		chr(0x2159) => '1/6',
		chr(0x215A) => '5/6',
		chr(0x215B) => '1/8',
		chr(0x215C) => '3/8',
		chr(0x215D) => '5/8',
		chr(0x215E) => '7/8',
		chr(0x215F) => '1/',
		chr(0x2160) => 'I',
		chr(0x2161) => 'II',
		chr(0x2162) => 'III',
		chr(0x2163) => 'IV',
		chr(0x2164) => 'V',
		chr(0x2165) => 'VI',
		chr(0x2166) => 'VII',
		chr(0x2167) => 'VIII',
		chr(0x2168) => 'IX',
		chr(0x2169) => 'X',
		chr(0x216A) => 'XI',
		chr(0x216B) => 'XII',
		chr(0x216C) => 'L',
		chr(0x216D) => 'C',
		chr(0x216E) => 'D',
		chr(0x216F) => 'M',
		chr(0x2170) => 'i',
		chr(0x2171) => 'ii',
		chr(0x2172) => 'iii',
		chr(0x2173) => 'iv',
		chr(0x2174) => 'v',
		chr(0x2175) => 'vi',
		chr(0x2176) => 'vii',
		chr(0x2177) => 'viii',
		chr(0x2178) => 'ix',
		chr(0x2179) => 'x',
		chr(0x217A) => 'xi',
		chr(0x217B) => 'xii',
		chr(0x217C) => 'l',
		chr(0x217D) => 'c',
		chr(0x217E) => 'd',
		chr(0x217F) => 'm',
		chr(0x2122) => 'TM',
		chr(0x25CC) => '', # dotted circle left when removing modifier for combining diacritical marks	
		chr(0x1D400) => 'A',
		chr(0x1D401) => 'B',
		chr(0x1D402) => 'C',
		chr(0x1D403) => 'D',
		chr(0x1D404) => 'E',
		chr(0x1D405) => 'F',
		chr(0x1D406) => 'G',
		chr(0x1D407) => 'H',
		chr(0x1D408) => 'I',
		chr(0x1D409) => 'J',
		chr(0x1D40A) => 'K',
		chr(0x1D40B) => 'L',
		chr(0x1D40C) => 'M',
		chr(0x1D40D) => 'N',
		chr(0x1D40E) => 'O',
		chr(0x1D40F) => 'P',
		chr(0x1D410) => 'Q',
		chr(0x1D411) => 'R',
		chr(0x1D412) => 'S',
		chr(0x1D413) => 'T',
		chr(0x1D414) => 'U',
		chr(0x1D415) => 'V',
		chr(0x1D416) => 'W',
		chr(0x1D417) => 'X',
		chr(0x1D418) => 'Y',
		chr(0x1D419) => 'Z',
		chr(0x1D41A) => 'a',
		chr(0x1D41B) => 'b',
		chr(0x1D41C) => 'c',
		chr(0x1D41D) => 'd',
		chr(0x1D41E) => 'e',
		chr(0x1D41F) => 'f',
		chr(0x1D420) => 'g',
		chr(0x1D421) => 'h',
		chr(0x1D422) => 'i',
		chr(0x1D423) => 'j',
		chr(0x1D424) => 'k',
		chr(0x1D425) => 'l',
		chr(0x1D426) => 'm',
		chr(0x1D427) => 'n',
		chr(0x1D428) => 'o',
		chr(0x1D429) => 'p',
		chr(0x1D42A) => 'q',
		chr(0x1D42B) => 'r',
		chr(0x1D42C) => 's',
		chr(0x1D42D) => 't',
		chr(0x1D42E) => 'u',
		chr(0x1D42F) => 'v',
		chr(0x1D430) => 'w',
		chr(0x1D431) => 'x',
		chr(0x1D432) => 'y',
		chr(0x1D433) => 'z',
		chr(0x1D434) => 'A',
		chr(0x1D435) => 'B',
		chr(0x1D436) => 'C',
		chr(0x1D437) => 'D',
		chr(0x1D438) => 'E',
		chr(0x1D439) => 'F',
		chr(0x1D43A) => 'G',
		chr(0x1D43B) => 'H',
		chr(0x1D43C) => 'I',
		chr(0x1D43D) => 'J',
		chr(0x1D43E) => 'K',
		chr(0x1D43F) => 'L',
		chr(0x1D440) => 'M',
		chr(0x1D441) => 'N',
		chr(0x1D442) => 'O',
		chr(0x1D443) => 'P',
		chr(0x1D444) => 'Q',
		chr(0x1D445) => 'R',
		chr(0x1D446) => 'S',
		chr(0x1D447) => 'T',
		chr(0x1D448) => 'U',
		chr(0x1D449) => 'V',
		chr(0x1D44A) => 'W',
		chr(0x1D44B) => 'X',
		chr(0x1D44C) => 'Y',
		chr(0x1D44D) => 'Z',		
		chr(0x1D44E) => 'a',
		chr(0x1D44F) => 'b',
		chr(0x1D450) => 'c',
		chr(0x1D451) => 'd',
		chr(0x1D452) => 'e',
		chr(0x1D454) => 'f',
		chr(0x1D455) => 'g',
		chr(0x1D455) => 'h',
		chr(0x1D456) => 'i',
		chr(0x1D457) => 'j',
		chr(0x1D458) => 'k',
		chr(0x1D459) => 'l',
		chr(0x1D45A) => 'm',
		chr(0x1D45B) => 'n',
		chr(0x1D45C) => 'o',
		chr(0x1D45D) => 'p',
		chr(0x1D45E) => 'q',
		chr(0x1D45F) => 'r',
		chr(0x1D460) => 's',
		chr(0x1D461) => 't',
		chr(0x1D462) => 'u',
		chr(0x1D463) => 'v',
		chr(0x1D464) => 'w',
		chr(0x1D465) => 'x',
		chr(0x1D466) => 'y',
		chr(0x1D467) => 'z',
		chr(0x1D468) => 'A',
		chr(0x1D469) => 'B',
		chr(0x1D46A) => 'C',
		chr(0x1D46B) => 'D',
		chr(0x1D46C) => 'E',
		chr(0x1D46D) => 'F',
		chr(0x1D46E) => 'G',
		chr(0x1D46F) => 'H',
		chr(0x1D470) => 'I',
		chr(0x1D471) => 'J',
		chr(0x1D472) => 'K',
		chr(0x1D473) => 'L',
		chr(0x1D474) => 'M',
		chr(0x1D475) => 'N',
		chr(0x1D476) => 'O',
		chr(0x1D477) => 'P',
		chr(0x1D478) => 'Q',
		chr(0x1D479) => 'R',
		chr(0x1D47A) => 'S',
		chr(0x1D47B) => 'T',
		chr(0x1D47C) => 'U',
		chr(0x1D47D) => 'V',
		chr(0x1D47E) => 'W',
		chr(0x1D47F) => 'X',
		chr(0x1D480) => 'Y',
		chr(0x1D481) => 'Z',
		chr(0x1D482) => 'a',
		chr(0x1D483) => 'b',
		chr(0x1D484) => 'c',
		chr(0x1D485) => 'd',
		chr(0x1D486) => 'e',
		chr(0x1D487) => 'f',
		chr(0x1D488) => 'g',
		chr(0x1D489) => 'h',
		chr(0x1D48A) => 'i',
		chr(0x1D48B) => 'j',
		chr(0x1D48C) => 'k',
		chr(0x1D48D) => 'l',
		chr(0x1D48E) => 'm',
		chr(0x1D48F) => 'n',
		chr(0x1D490) => 'o',
		chr(0x1D491) => 'p',
		chr(0x1D492) => 'q',
		chr(0x1D493) => 'r',
		chr(0x1D494) => 's',
		chr(0x1D495) => 't',
		chr(0x1D496) => 'u',
		chr(0x1D497) => 'v',
		chr(0x1D498) => 'w',
		chr(0x1D499) => 'x',
		chr(0x1D49A) => 'y',
		chr(0x1D49B) => 'z',
		chr(0x1D49C) => 'A',
		chr(0x1D49D) => 'B',
		chr(0x1D49E) => 'C',
		chr(0x1D49F) => 'D',
		chr(0x1D4A0) => 'E',
		chr(0x1D4A1) => 'F',
		chr(0x1D4A2) => 'G',
		chr(0x1D4A3) => 'H',
		chr(0x1D4A4) => 'I',
		chr(0x1D4A5) => 'J',
		chr(0x1D4A6) => 'K',
		chr(0x1D4A7) => 'L',
		chr(0x1D4A8) => 'M',
		chr(0x1D4A9) => 'N',
		chr(0x1D4AA) => 'O',
		chr(0x1D4AB) => 'P',
		chr(0x1D4AC) => 'Q',
		chr(0x1D4AD) => 'R',
		chr(0x1D4AE) => 'S',
		chr(0x1D4AF) => 'T',
		chr(0x1D4B0) => 'U',
		chr(0x1D4B1) => 'V',
		chr(0x1D4B2) => 'W',
		chr(0x1D4B3) => 'X',
		chr(0x1D4B4) => 'Y',
		chr(0x1D4B5) => 'Z',
		chr(0x1D4B6) => 'a',
		chr(0x1D4B7) => 'b',
		chr(0x1D4B8) => 'c',
		chr(0x1D4B9) => 'd',
		chr(0x1D4BA) => 'e',
		chr(0x1D4BB) => 'f',
		chr(0x1D4BC) => 'g',
		chr(0x1D4BD) => 'h',
		chr(0x1D4BE) => 'i',
		chr(0x1D4BF) => 'j',
		chr(0x1D4C0) => 'k',
		chr(0x1D4C1) => 'l',
		chr(0x1D4C2) => 'm',
		chr(0x1D4C3) => 'n',
		chr(0x1D4C4) => 'o',
		chr(0x1D4C5) => 'p',
		chr(0x1D4C6) => 'q',
		chr(0x1D4C7) => 'r',
		chr(0x1D4C8) => 's',
		chr(0x1D4C9) => 't',
		chr(0x1D4CA) => 'u',
		chr(0x1D4CB) => 'v',
		chr(0x1D4CC) => 'w',
		chr(0x1D4CD) => 'x',
		chr(0x1D4CE) => 'y',
		chr(0x1D4CF) => 'z',
		chr(0x1D4D0) => 'A',
		chr(0x1D4D1) => 'B',
		chr(0x1D4D2) => 'C',
		chr(0x1D4D3) => 'D',
		chr(0x1D4D4) => 'E',
		chr(0x1D4D5) => 'F',
		chr(0x1D4D6) => 'G',
		chr(0x1D4D7) => 'H',
		chr(0x1D4D8) => 'I',
		chr(0x1D4D9) => 'J',
		chr(0x1D4DA) => 'K',
		chr(0x1D4DB) => 'L',
		chr(0x1D4DC) => 'M',
		chr(0x1D4DD) => 'N',
		chr(0x1D4DE) => 'O',
		chr(0x1D4DF) => 'P',
		chr(0x1D4E0) => 'Q',
		chr(0x1D4E1) => 'R',
		chr(0x1D4E2) => 'S',
		chr(0x1D4E3) => 'T',
		chr(0x1D4E4) => 'U',
		chr(0x1D4E5) => 'V',
		chr(0x1D4E6) => 'W',
		chr(0x1D4E7) => 'X',
		chr(0x1D4E8) => 'Y',
		chr(0x1D4E9) => 'Z',
		chr(0x1D4EA) => 'a',
		chr(0x1D4EB) => 'b',
		chr(0x1D4EC) => 'c',
		chr(0x1D4ED) => 'd',
		chr(0x1D4EE) => 'e',
		chr(0x1D4EF) => 'f',
		chr(0x1D4F0) => 'g',
		chr(0x1D4F1) => 'h',
		chr(0x1D4F2) => 'i',
		chr(0x1D4F3) => 'j',
		chr(0x1D4F4) => 'k',
		chr(0x1D4F5) => 'l',
		chr(0x1D4F6) => 'm',
		chr(0x1D4F7) => 'n',
		chr(0x1D4F8) => 'o',
		chr(0x1D4F9) => 'p',
		chr(0x1D4FA) => 'q',
		chr(0x1D4FB) => 'r',
		chr(0x1D4FC) => 's',
		chr(0x1D4FD) => 't',
		chr(0x1D4FE) => 'u',
		chr(0x1D4FF) => 'v',
		chr(0x1D500) => 'w',
		chr(0x1D501) => 'x',
		chr(0x1D502) => 'y',
		chr(0x1D503) => 'z',
		chr(0x1D504) => 'A',
		chr(0x1D505) => 'B',
		chr(0x1D506) => 'C',
		chr(0x1D507) => 'D',
		chr(0x1D508) => 'E',
		chr(0x1D509) => 'F',
		chr(0x1D50A) => 'G',
		chr(0x1D50B) => 'H',
		chr(0x1D50C) => 'I',
		chr(0x1D50D) => 'J',
		chr(0x1D50E) => 'K',
		chr(0x1D50F) => 'L',
		chr(0x1D510) => 'M',
		chr(0x1D511) => 'N',
		chr(0x1D512) => 'O',
		chr(0x1D513) => 'P',
		chr(0x1D514) => 'Q',
		chr(0x1D515) => 'R',
		chr(0x1D516) => 'S',
		chr(0x1D517) => 'T',
		chr(0x1D518) => 'U',
		chr(0x1D519) => 'V',
		chr(0x1D51A) => 'W',
		chr(0x1D51B) => 'X',
		chr(0x1D51C) => 'Y',
		chr(0x1D51D) => 'Z',
		chr(0x1D51E) => 'a',
		chr(0x1D51F) => 'b',
		chr(0x1D520) => 'c',
		chr(0x1D521) => 'd',
		chr(0x1D522) => 'e',
		chr(0x1D523) => 'f',
		chr(0x1D524) => 'g',
		chr(0x1D525) => 'h',
		chr(0x1D526) => 'i',
		chr(0x1D527) => 'j',
		chr(0x1D528) => 'k',
		chr(0x1D529) => 'l',
		chr(0x1D52A) => 'm',
		chr(0x1D52B) => 'n',
		chr(0x1D52C) => 'o',
		chr(0x1D52D) => 'p',
		chr(0x1D52E) => 'q',
		chr(0x1D52F) => 'r',
		chr(0x1D530) => 's',
		chr(0x1D531) => 't',
		chr(0x1D532) => 'u',
		chr(0x1D533) => 'v',
		chr(0x1D534) => 'w',
		chr(0x1D535) => 'x',
		chr(0x1D536) => 'y',
		chr(0x1D537) => 'z',
		chr(0x1D538) => 'A',
		chr(0x1D539) => 'B',
		chr(0x1D53A) => 'C',
		chr(0x1D53B) => 'D',
		chr(0x1D53C) => 'E',
		chr(0x1D53D) => 'F',
		chr(0x1D53E) => 'G',
		chr(0x1D53F) => 'H',
		chr(0x1D540) => 'I',
		chr(0x1D541) => 'J',
		chr(0x1D542) => 'K',
		chr(0x1D543) => 'L',
		chr(0x1D544) => 'M',
		chr(0x1D545) => 'N',
		chr(0x1D546) => 'O',
		chr(0x1D547) => 'P',
		chr(0x1D548) => 'Q',
		chr(0x1D549) => 'R',
		chr(0x1D54A) => 'S',
		chr(0x1D54B) => 'T',
		chr(0x1D54C) => 'U',
		chr(0x1D54D) => 'V',
		chr(0x1D54E) => 'W',
		chr(0x1D54F) => 'X',
		chr(0x1D550) => 'Y',
		chr(0x1D551) => 'Z',
		chr(0x1D552) => 'a',
		chr(0x1D553) => 'b',
		chr(0x1D554) => 'c',
		chr(0x1D555) => 'd',
		chr(0x1D556) => 'e',
		chr(0x1D557) => 'f',
		chr(0x1D558) => 'g',
		chr(0x1D559) => 'h',
		chr(0x1D55A) => 'i',
		chr(0x1D55B) => 'j',
		chr(0x1D55C) => 'k',
		chr(0x1D55D) => 'l',
		chr(0x1D55E) => 'm',
		chr(0x1D55F) => 'n',
		chr(0x1D560) => 'o',
		chr(0x1D561) => 'p',
		chr(0x1D562) => 'q',
		chr(0x1D563) => 'r',
		chr(0x1D564) => 's',
		chr(0x1D565) => 't',
		chr(0x1D566) => 'u',
		chr(0x1D567) => 'v',
		chr(0x1D568) => 'w',
		chr(0x1D569) => 'x',
		chr(0x1D56A) => 'y',
		chr(0x1D56B) => 'z',
		chr(0x1D56C) => 'A',
		chr(0x1D56D) => 'B',
		chr(0x1D56E) => 'C',
		chr(0x1D56F) => 'D',
		chr(0x1D570) => 'E',
		chr(0x1D571) => 'F',
		chr(0x1D572) => 'G',
		chr(0x1D573) => 'H',
		chr(0x1D574) => 'I',
		chr(0x1D575) => 'J',
		chr(0x1D576) => 'K',
		chr(0x1D577) => 'L',
		chr(0x1D578) => 'M',
		chr(0x1D579) => 'N',
		chr(0x1D57A) => 'O',
		chr(0x1D57B) => 'P',
		chr(0x1D57C) => 'Q',
		chr(0x1D57D) => 'R',
		chr(0x1D57E) => 'S',
		chr(0x1D57F) => 'T',
		chr(0x1D580) => 'U',
		chr(0x1D581) => 'V',
		chr(0x1D582) => 'W',
		chr(0x1D583) => 'X',
		chr(0x1D584) => 'Y',
		chr(0x1D585) => 'Z',
		chr(0x1D586) => 'a',
		chr(0x1D587) => 'b',
		chr(0x1D588) => 'c',
		chr(0x1D589) => 'd',
		chr(0x1D58A) => 'e',
		chr(0x1D58B) => 'f',
		chr(0x1D58C) => 'g',
		chr(0x1D58D) => 'h',
		chr(0x1D58E) => 'i',
		chr(0x1D59F) => 'j',
		chr(0x1D590) => 'k',
		chr(0x1D591) => 'l',
		chr(0x1D592) => 'm',
		chr(0x1D593) => 'n',
		chr(0x1D594) => 'o',
		chr(0x1D595) => 'p',
		chr(0x1D596) => 'q',
		chr(0x1D597) => 'r',
		chr(0x1D598) => 's',
		chr(0x1D599) => 't',
		chr(0x1D59A) => 'u',
		chr(0x1D59B) => 'v',
		chr(0x1D59C) => 'w',
		chr(0x1D59D) => 'x',
		chr(0x1D59E) => 'y',
		chr(0x1D59F) => 'z',
		chr(0x1D5A0) => 'A',
		chr(0x1D5A1) => 'B',
		chr(0x1D5A2) => 'C',
		chr(0x1D5A3) => 'D',
		chr(0x1D5A4) => 'E',
		chr(0x1D5A5) => 'F',
		chr(0x1D5A6) => 'G',
		chr(0x1D5A7) => 'H',
		chr(0x1D5A8) => 'I',
		chr(0x1D5A9) => 'J',
		chr(0x1D5AA) => 'K',
		chr(0x1D5AB) => 'L',
		chr(0x1D5AC) => 'M',
		chr(0x1D5AD) => 'N',
		chr(0x1D5AE) => 'O',
		chr(0x1D5AF) => 'P',
		chr(0x1D5B0) => 'Q',
		chr(0x1D5B1) => 'R',
		chr(0x1D5B2) => 'S',
		chr(0x1D5B3) => 'T',
		chr(0x1D5B4) => 'U',
		chr(0x1D5B5) => 'V',
		chr(0x1D5B6) => 'W',
		chr(0x1D5B7) => 'X',
		chr(0x1D5B8) => 'Y',
		chr(0x1D5B9) => 'Z',
		chr(0x1D5BA) => 'a',
		chr(0x1D5BB) => 'b',
		chr(0x1D5BC) => 'c',
		chr(0x1D5BD) => 'd',
		chr(0x1D5BE) => 'e',
		chr(0x1D5BF) => 'f',
		chr(0x1D5C0) => 'g',
		chr(0x1D5C1) => 'h',
		chr(0x1D5C2) => 'i',
		chr(0x1D5C3) => 'j',
		chr(0x1D5C4) => 'k',
		chr(0x1D5C5) => 'l',
		chr(0x1D5C6) => 'm',
		chr(0x1D5C7) => 'n',
		chr(0x1D5C8) => 'o',
		chr(0x1D5C9) => 'p',
		chr(0x1D5CA) => 'q',
		chr(0x1D5CB) => 'r',
		chr(0x1D5CC) => 's',
		chr(0x1D5CD) => 't',
		chr(0x1D5CE) => 'u',
		chr(0x1D5CF) => 'v',
		chr(0x1D5D0) => 'w',
		chr(0x1D5D1) => 'x',
		chr(0x1D5D2) => 'y',
		chr(0x1D5D3) => 'z',
		chr(0x1D5D4) => 'A',
		chr(0x1D5D5) => 'B',
		chr(0x1D5D6) => 'C',
		chr(0x1D5D7) => 'D',
		chr(0x1D5D8) => 'E',
		chr(0x1D5D9) => 'F',
		chr(0x1D5DA) => 'G',
		chr(0x1D5DB) => 'H',
		chr(0x1D5DC) => 'I',
		chr(0x1D5DD) => 'J',
		chr(0x1D5DE) => 'K',
		chr(0x1D5DF) => 'L',
		chr(0x1D5E0) => 'M',
		chr(0x1D5E1) => 'N',
		chr(0x1D5E2) => 'O',
		chr(0x1D5E3) => 'P',
		chr(0x1D5E4) => 'Q',
		chr(0x1D5E5) => 'R',
		chr(0x1D5E6) => 'S',
		chr(0x1D5E7) => 'T',
		chr(0x1D5E8) => 'U',
		chr(0x1D5E9) => 'V',
		chr(0x1D5EA) => 'W',
		chr(0x1D5EB) => 'X',
		chr(0x1D5EC) => 'Y',
		chr(0x1D5ED) => 'Z',
		chr(0x1D5EE) => 'a',
		chr(0x1D5EF) => 'b',
		chr(0x1D5F0) => 'c',
		chr(0x1D5F1) => 'd',
		chr(0x1D5F2) => 'e',
		chr(0x1D5F3) => 'f',
		chr(0x1D5F4) => 'g',
		chr(0x1D5F5) => 'h',
		chr(0x1D5F6) => 'i',
		chr(0x1D5F7) => 'j',
		chr(0x1D5F8) => 'k',
		chr(0x1D5F9) => 'l',
		chr(0x1D5FA) => 'm',
		chr(0x1D5FB) => 'n',
		chr(0x1D5FC) => 'o',
		chr(0x1D5FD) => 'p',
		chr(0x1D5FE) => 'q',
		chr(0x1D5FF) => 'r',
		chr(0x1D600) => 's',
		chr(0x1D601) => 't',
		chr(0x1D602) => 'u',
		chr(0x1D603) => 'v',
		chr(0x1D604) => 'w',
		chr(0x1D605) => 'x',
		chr(0x1D606) => 'y',
		chr(0x1D607) => 'z',
		chr(0x1D608) => 'A',
		chr(0x1D609) => 'B',
		chr(0x1D60A) => 'C',
		chr(0x1D60B) => 'D',
		chr(0x1D60C) => 'E',
		chr(0x1D60D) => 'F',
		chr(0x1D60E) => 'G',
		chr(0x1D60F) => 'H',
		chr(0x1D610) => 'I',
		chr(0x1D611) => 'J',
		chr(0x1D612) => 'K',
		chr(0x1D613) => 'L',
		chr(0x1D614) => 'M',
		chr(0x1D615) => 'N',
		chr(0x1D616) => 'O',
		chr(0x1D617) => 'P',
		chr(0x1D618) => 'Q',
		chr(0x1D619) => 'R',
		chr(0x1D61A) => 'S',
		chr(0x1D61B) => 'T',
		chr(0x1D61C) => 'U',
		chr(0x1D61D) => 'V',
		chr(0x1D61E) => 'W',
		chr(0x1D61F) => 'X',
		chr(0x1D620) => 'Y',
		chr(0x1D621) => 'Z',
		chr(0x1D622) => 'a',
		chr(0x1D623) => 'b',
		chr(0x1D624) => 'c',
		chr(0x1D625) => 'd',
		chr(0x1D626) => 'e',
		chr(0x1D627) => 'f',
		chr(0x1D628) => 'g',
		chr(0x1D629) => 'h',
		chr(0x1D62A) => 'i',
		chr(0x1D62B) => 'j',
		chr(0x1D62C) => 'k',
		chr(0x1D62D) => 'l',
		chr(0x1D62E) => 'm',
		chr(0x1D62F) => 'n',
		chr(0x1D630) => 'o',
		chr(0x1D631) => 'p',
		chr(0x1D632) => 'q',
		chr(0x1D633) => 'r',
		chr(0x1D634) => 's',
		chr(0x1D635) => 't',
		chr(0x1D636) => 'u',
		chr(0x1D637) => 'v',
		chr(0x1D638) => 'w',
		chr(0x1D639) => 'x',
		chr(0x1D63A) => 'y',
		chr(0x1D63B) => 'z',
		chr(0x1D63C) => 'A',
		chr(0x1D63D) => 'B',
		chr(0x1D63E) => 'C',
		chr(0x1D63F) => 'D',
		chr(0x1D640) => 'E',
		chr(0x1D641) => 'F',
		chr(0x1D642) => 'G',
		chr(0x1D643) => 'H',
		chr(0x1D644) => 'I',
		chr(0x1D645) => 'J',
		chr(0x1D646) => 'K',
		chr(0x1D647) => 'L',
		chr(0x1D648) => 'M',
		chr(0x1D649) => 'N',
		chr(0x1D64A) => 'O',
		chr(0x1D64B) => 'P',
		chr(0x1D64C) => 'Q',
		chr(0x1D64D) => 'R',
		chr(0x1D64E) => 'S',
		chr(0x1D64F) => 'T',
		chr(0x1D650) => 'U',
		chr(0x1D651) => 'V',
		chr(0x1D652) => 'W',
		chr(0x1D653) => 'X',
		chr(0x1D654) => 'Y',
		chr(0x1D655) => 'Z',
		chr(0x1D656) => 'a',
		chr(0x1D657) => 'b',
		chr(0x1D658) => 'c',
		chr(0x1D659) => 'd',
		chr(0x1D65A) => 'e',
		chr(0x1D65B) => 'f',
		chr(0x1D65C) => 'g',
		chr(0x1D65D) => 'h',
		chr(0x1D65E) => 'i',
		chr(0x1D65F) => 'j',
		chr(0x1D660) => 'k',
		chr(0x1D661) => 'l',
		chr(0x1D662) => 'm',
		chr(0x1D663) => 'n',
		chr(0x1D664) => 'o',
		chr(0x1D665) => 'p',
		chr(0x1D666) => 'q',
		chr(0x1D667) => 'r',
		chr(0x1D668) => 's',
		chr(0x1D669) => 't',
		chr(0x1D66A) => 'u',
		chr(0x1D66B) => 'v',
		chr(0x1D66C) => 'w',
		chr(0x1D66D) => 'x',
		chr(0x1D66E) => 'y',
		chr(0x1D66F) => 'z',
		chr(0x1D670) => 'A',
		chr(0x1D671) => 'B',
		chr(0x1D672) => 'C',
		chr(0x1D673) => 'D',
		chr(0x1D674) => 'E',
		chr(0x1D675) => 'F',
		chr(0x1D676) => 'G',
		chr(0x1D677) => 'H',
		chr(0x1D678) => 'I',
		chr(0x1D679) => 'J',
		chr(0x1D67A) => 'K',
		chr(0x1D67B) => 'L',
		chr(0x1D67C) => 'M',
		chr(0x1D67D) => 'N',
		chr(0x1D67E) => 'O',
		chr(0x1D67F) => 'P',
		chr(0x1D680) => 'Q',
		chr(0x1D681) => 'R',
		chr(0x1D682) => 'S',
		chr(0x1D683) => 'T',
		chr(0x1D684) => 'U',
		chr(0x1D685) => 'V',
		chr(0x1D686) => 'W',
		chr(0x1D687) => 'X',
		chr(0x1D688) => 'Y',
		chr(0x1D689) => 'Z',
		chr(0x1D68A) => 'a',
		chr(0x1D68B) => 'b',
		chr(0x1D68C) => 'c',
		chr(0x1D68D) => 'd',
		chr(0x1D68E) => 'e',
		chr(0x1D68F) => 'f',
		chr(0x1D690) => 'g',
		chr(0x1D691) => 'h',
		chr(0x1D692) => 'i',
		chr(0x1D693) => 'j',
		chr(0x1D694) => 'k',
		chr(0x1D695) => 'l',
		chr(0x1D696) => 'm',
		chr(0x1D697) => 'n',
		chr(0x1D698) => 'o',
		chr(0x1D699) => 'p',
		chr(0x1D69A) => 'q',
		chr(0x1D69B) => 'r',
		chr(0x1D69C) => 's',
		chr(0x1D69D) => 't',
		chr(0x1D69E) => 'u',
		chr(0x1D69F) => 'v',
		chr(0x1D6A0) => 'w',
		chr(0x1D6A1) => 'x',
		chr(0x1D6A2) => 'y',
		chr(0x1D6A3) => 'z',
		chr(0x1D6A4) => 'i',
		chr(0x1D6A5) => 'j',
		chr(0x1D6A6) => 'y',
		chr(0x1D6A7) => '',
		chr(0x1D6A8) => 'Α',
		chr(0x1D6A9) => 'Β',
		chr(0x1D6AA) => 'Γ',
		chr(0x1D6AB) => 'Δ',
		chr(0x1D6AC) => 'Ε',
		chr(0x1D6AD) => 'Ζ',
		chr(0x1D6AE) => 'Η',
		chr(0x1D6AF) => 'Θ',
		chr(0x1D6B0) => 'Ι',
		chr(0x1D6B1) => 'Κ',
		chr(0x1D6B2) => 'Λ',
		chr(0x1D6B3) => 'Μ',
		chr(0x1D6B4) => 'Ν',
		chr(0x1D6B5) => 'Ξ',
		chr(0x1D6B6) => 'Ο',
		chr(0x1D6B7) => 'Π',
		chr(0x1D6B8) => 'Ρ',
		chr(0x1D6B9) => 'Σ',
		chr(0x1D6BA) => 'Τ',
		chr(0x1D6BB) => 'Υ',
		chr(0x1D6BC) => 'Ε',
		chr(0x1D6BD) => 'Φ',
		chr(0x1D6BE) => 'Χ',
		chr(0x1D6BF) => 'Ψ',
		chr(0x1D6C0) => 'Ω',
		chr(0x1D6C1) => '∇',
		chr(0x1D6C2) => 'α',
		chr(0x1D6C3) => 'β',
		chr(0x1D6C4) => 'γ',
		chr(0x1D6C5) => 'δ',
		chr(0x1D6C6) => 'ε',
		chr(0x1D6C7) => 'ζ',
		chr(0x1D6C8) => 'η',
		chr(0x1D6C9) => 'θ',
		chr(0x1D6CA) => 'ι',
		chr(0x1D6CB) => 'κ',
		chr(0x1D6CC) => 'λ',
		chr(0x1D6CD) => 'μ',
		chr(0x1D6CE) => 'ν',
		chr(0x1D6CF) => 'ξ',
		chr(0x1D6D0) => 'ο',
		chr(0x1D6D1) => 'π',
		chr(0x1D6D2) => 'ρ',
		chr(0x1D6D3) => 'ς',
		chr(0x1D6D4) => 'σ',
		chr(0x1D6D5) => 'τ',
		chr(0x1D6D6) => 'υ',
		chr(0x1D6D7) => 'φ',
		chr(0x1D6D8) => 'χ',
		chr(0x1D6D9) => 'ψ',
		chr(0x1D6DA) => 'ω',
		chr(0x1D6DB) => '∂',
		chr(0x1D6DC) => 'ϵ',
		chr(0x1D6DD) => 'ϑ',
		chr(0x1D6DE) => 'κ',
		chr(0x1D6DF) => 'ϕ',
		chr(0x1D6E0) => 'ρ',
		chr(0x1D6E1) => 'ϖ',
		chr(0x1D6E2) => 'Α',
		chr(0x1D6E3) => 'Β',
		chr(0x1D6E4) => 'Γ',
		chr(0x1D6E5) => 'Δ',
		chr(0x1D6E6) => 'Ε',
		chr(0x1D6E7) => 'Ζ',
		chr(0x1D6E8) => 'Η',
		chr(0x1D6E9) => 'Θ',
		chr(0x1D6EA) => 'Ι',
		chr(0x1D6EB) => 'Κ',
		chr(0x1D6EC) => 'Λ',
		chr(0x1D6ED) => 'Μ',
		chr(0x1D6EE) => 'Ν',
		chr(0x1D6EF) => 'Ξ',
		chr(0x1D6F0) => 'Ο',
		chr(0x1D6F1) => 'Π',
		chr(0x1D6F2) => 'Ρ',
		chr(0x1D6F3) => 'θ',
		chr(0x1D6F4) => 'Σ',
		chr(0x1D6F5) => 'Τ',
		chr(0x1D6F6) => 'Υ',
		chr(0x1D6F7) => 'Φ',
		chr(0x1D6F8) => 'Χ',
		chr(0x1D6F9) => 'Ψ',
		chr(0x1D6FA) => 'Ω',
		chr(0x1D6FB) => '∇',
		chr(0x1D6FC) => 'α',
		chr(0x1D6FD) => 'β',
		chr(0x1D6FE) => 'γ',
		chr(0x1D6FF) => 'δ',
		chr(0x1D700) => 'ε',
		chr(0x1D701) => 'ζ',
		chr(0x1D702) => 'η',
		chr(0x1D703) => 'θ',
		chr(0x1D704) => 'ι',
		chr(0x1D705) => 'κ',
		chr(0x1D706) => 'λ',
		chr(0x1D707) => 'μ',
		chr(0x1D708) => 'ν',
		chr(0x1D709) => 'ξ',
		chr(0x1D70A) => 'ο',
		chr(0x1D70B) => 'π',
		chr(0x1D70C) => 'ρ',
		chr(0x1D70D) => 'ς',
		chr(0x1D70E) => 'σ',
		chr(0x1D70F) => 'τ',
		chr(0x1D70F) => 'υ',
		chr(0x1D710) => 'φ',
		chr(0x1D712) => 'χ',
		chr(0x1D713) => 'ψ',
		chr(0x1D714) => 'ω',
		chr(0x1D715) => '∂',
		chr(0x1D716) => 'ϵ',
		chr(0x1D717) => 'ϑ',
		chr(0x1D718) => 'κ',
		chr(0x1D719) => 'ϕ',
		chr(0x1D71A) => 'ρ',
		chr(0x1D71B) => 'ϖ',
		chr(0x1D71C) => 'Α',
		chr(0x1D71D) => 'Β',
		chr(0x1D71E) => 'Γ',
		chr(0x1D71F) => 'Δ',
		chr(0x1D720) => 'Ε',
		chr(0x1D721) => 'Ζ',
		chr(0x1D722) => 'Η',
		chr(0x1D723) => 'Θ',
		chr(0x1D724) => 'Ι',
		chr(0x1D725) => 'Κ',
		chr(0x1D726) => 'Λ',
		chr(0x1D727) => 'Μ',
		chr(0x1D728) => 'Ν',
		chr(0x1D729) => 'Ξ',
		chr(0x1D72A) => 'Ο',
		chr(0x1D72B) => 'Π',
		chr(0x1D72C) => 'Ρ',
		chr(0x1D72D) => 'θ',
		chr(0x1D72E) => 'Σ',
		chr(0x1D72F) => 'Τ',
		chr(0x1D730) => 'Υ',
		chr(0x1D731) => 'Φ',
		chr(0x1D732) => 'Χ',
		chr(0x1D733) => 'Ψ',
		chr(0x1D734) => 'Ω',
		chr(0x1D735) => '∇',
		chr(0x1D736) => 'α',
		chr(0x1D737) => 'β',
		chr(0x1D738) => 'γ',
		chr(0x1D739) => 'δ',
		chr(0x1D73A) => 'ε',
		chr(0x1D73B) => 'ζ',
		chr(0x1D73C) => 'η',
		chr(0x1D73D) => 'θ',
		chr(0x1D73E) => 'ι',
		chr(0x1D73F) => 'κ',
		chr(0x1D740) => 'λ',
		chr(0x1D741) => 'μ',
		chr(0x1D742) => 'ν',
		chr(0x1D743) => 'ξ',
		chr(0x1D744) => 'ο',
		chr(0x1D745) => 'π',
		chr(0x1D746) => 'ρ',
		chr(0x1D747) => 'ς',
		chr(0x1D748) => 'σ',
		chr(0x1D749) => 'τ',
		chr(0x1D74A) => 'υ',
		chr(0x1D74B) => 'φ',
		chr(0x1D74C) => 'χ',
		chr(0x1D74D) => 'ψ',
		chr(0x1D74E) => 'ω',
		chr(0x1D74F) => '∂',
		chr(0x1D750) => 'ϵ',
		chr(0x1D751) => 'ϑ',
		chr(0x1D752) => 'κ',
		chr(0x1D753) => 'ϕ',
		chr(0x1D754) => 'ρ',
		chr(0x1D755) => 'ϖ',
		chr(0x1D756) => 'Α',
		chr(0x1D757) => 'Β',
		chr(0x1D758) => 'Γ',
		chr(0x1D759) => 'Δ',
		chr(0x1D75A) => 'Ε',
		chr(0x1D75B) => 'Ζ',
		chr(0x1D75C) => 'Η',
		chr(0x1D75D) => 'Θ',
		chr(0x1D75E) => 'Ι',
		chr(0x1D75F) => 'Κ',
		chr(0x1D760) => 'Λ',
		chr(0x1D761) => 'Μ',
		chr(0x1D762) => 'Ν',
		chr(0x1D763) => 'Ξ',
		chr(0x1D764) => 'Ο',
		chr(0x1D765) => 'Π',
		chr(0x1D766) => 'Ρ',
		chr(0x1D767) => 'θ',
		chr(0x1D768) => 'Σ',
		chr(0x1D769) => 'Τ',
		chr(0x1D76A) => 'Υ',
		chr(0x1D76B) => 'Φ',
		chr(0x1D76C) => 'Χ',
		chr(0x1D76D) => 'Ψ',
		chr(0x1D76E) => 'Ω',
		chr(0x1D76F) => '∇',
		chr(0x1D770) => 'α',
		chr(0x1D771) => 'β',
		chr(0x1D772) => 'γ',
		chr(0x1D773) => 'δ',
		chr(0x1D774) => 'ε',
		chr(0x1D775) => 'ζ',
		chr(0x1D776) => 'η',
		chr(0x1D777) => 'θ',
		chr(0x1D778) => 'ι',
		chr(0x1D779) => 'κ',
		chr(0x1D77A) => 'λ',
		chr(0x1D77B) => 'μ',
		chr(0x1D77C) => 'ν',
		chr(0x1D77D) => 'ξ',
		chr(0x1D77E) => 'ο',
		chr(0x1D77F) => 'π',
		chr(0x1D780) => 'ρ',
		chr(0x1D781) => 'ς',
		chr(0x1D782) => 'σ',
		chr(0x1D783) => 'τ',
		chr(0x1D784) => 'υ',
		chr(0x1D785) => 'φ',
		chr(0x1D786) => 'χ',
		chr(0x1D787) => 'ψ',
		chr(0x1D788) => 'ω',
		chr(0x1D789) => '∂',
		chr(0x1D78A) => 'ϵ',
		chr(0x1D78B) => 'ϑ',
		chr(0x1D78C) => 'κ',
		chr(0x1D78D) => 'ϕ',
		chr(0x1D78E) => 'ρ',
		chr(0x1D78F) => 'ϖ',
		chr(0x1D790) => 'Α',
		chr(0x1D791) => 'Β',
		chr(0x1D792) => 'Γ',
		chr(0x1D793) => 'Δ',
		chr(0x1D794) => 'Ε',
		chr(0x1D795) => 'Ζ',
		chr(0x1D796) => 'Η',
		chr(0x1D797) => 'Θ',
		chr(0x1D798) => 'Ι',
		chr(0x1D799) => 'Κ',
		chr(0x1D79A) => 'Λ',
		chr(0x1D79B) => 'Μ',
		chr(0x1D79C) => 'Ν',
		chr(0x1D79D) => 'Ξ',
		chr(0x1D79E) => 'Ο',
		chr(0x1D79F) => 'Π',
		chr(0x1D7A0) => 'Ρ',
		chr(0x1D7A1) => 'θ',
		chr(0x1D7A2) => 'Σ',
		chr(0x1D7A3) => 'Τ',
		chr(0x1D7A4) => 'Υ',
		chr(0x1D7A5) => 'Φ',
		chr(0x1D7A6) => 'Χ',
		chr(0x1D7A7) => 'Ψ',
		chr(0x1D7A8) => 'Ω',
		chr(0x1D7A9) => '∇',
		chr(0x1D7AA) => 'α',
		chr(0x1D7AB) => 'β',
		chr(0x1D7AC) => 'γ',
		chr(0x1D7AD) => 'δ',
		chr(0x1D7AE) => 'ε',
		chr(0x1D7AF) => 'ζ',
		chr(0x1D7B0) => 'η',
		chr(0x1D7B1) => 'θ',
		chr(0x1D7B2) => 'ι',
		chr(0x1D7B3) => 'κ',
		chr(0x1D7B4) => 'λ',
		chr(0x1D7B5) => 'μ',
		chr(0x1D7B6) => 'ν',
		chr(0x1D7B7) => 'ξ',
		chr(0x1D7B8) => 'ο',
		chr(0x1D7B9) => 'π',
		chr(0x1D7BA) => 'ρ',
		chr(0x1D7BB) => 'ς',
		chr(0x1D7BC) => 'σ',
		chr(0x1D7BD) => 'τ',
		chr(0x1D7BE) => 'υ',
		chr(0x1D7BF) => 'φ',
		chr(0x1D7C0) => 'χ',
		chr(0x1D7C1) => 'ψ',
		chr(0x1D7C2) => 'ω',
		chr(0x1D7C3) => '∂',
		chr(0x1D7C4) => 'ϵ',
		chr(0x1D7C5) => 'ϑ',
		chr(0x1D7C6) => 'κ',
		chr(0x1D7C7) => 'ϕ',
		chr(0x1D7C8) => 'ρ',
		chr(0x1D7C9) => 'ϖ',
		chr(0x1D7CA) => 'Ϝ',
		chr(0x1D7CB) => 'ϝ',
		chr(0x1D7CC) => '',
		chr(0x1D7CD) => '',
		chr(0x1D7CE) => '0',
		chr(0x1D7CF) => '1',
		chr(0x1D7D0) => '2',
		chr(0x1D7D1) => '3',
		chr(0x1D7D2) => '4',
		chr(0x1D7D3) => '5',
		chr(0x1D7D4) => '6',
		chr(0x1D7D5) => '7',
		chr(0x1D7D6) => '8',
		chr(0x1D7D7) => '9',
		chr(0x1D7D8) => '0',
		chr(0x1D7D9) => '1',
		chr(0x1D7DA) => '2',
		chr(0x1D7DB) => '3',
		chr(0x1D7DC) => '4',
		chr(0x1D7DD) => '5',
		chr(0x1D7DE) => '6',
		chr(0x1D7DF) => '7',
		chr(0x1D7E0) => '8',
		chr(0x1D7E1) => '9',
		chr(0x1D7E2) => '0',
		chr(0x1D7E3) => '1',
		chr(0x1D7E4) => '2',
		chr(0x1D7E5) => '3',
		chr(0x1D7E6) => '4',
		chr(0x1D7E7) => '5',
		chr(0x1D7E8) => '6',
		chr(0x1D7E9) => '7',
		chr(0x1D7EA) => '8',
		chr(0x1D7EB) => '9',
		chr(0x1D7EC) => '0',
		chr(0x1D7ED) => '1',
		chr(0x1D7EE) => '2',
		chr(0x1D7EF) => '3',
		chr(0x1D7F0) => '4',
		chr(0x1D7F1) => '5',
		chr(0x1D7F2) => '6',
		chr(0x1D7F3) => '7',
		chr(0x1D7F4) => '8',
		chr(0x1D7F5) => '9',
		chr(0x1D7F6) => '0',
		chr(0x1D7F7) => '1',
		chr(0x1D7F8) => '2',
		chr(0x1D7F9) => '3',
		chr(0x1D7FA) => '4',
		chr(0x1D7FB) => '5',
		chr(0x1D7FC) => '6',
		chr(0x1D7FD) => '7',
		chr(0x1D7FE) => '8',
		chr(0x1D7FF) => '9',		
};

# Shortest word length that is normally indexed.
$EPrints::Index::FREETEXT_MIN_WORD_SIZE = 3;

# The two word tables below are hash references (word => 1) rather
# than arrays: they are only ever consulted to see whether a word
# is present, and scanning a long array for that could be slow.

# Stop words: never indexed, whatever their length.
$EPrints::Index::FREETEXT_STOP_WORDS = {
	map { $_ => 1 } qw(
		this	are	which	with
		that	can	from	these
		those	the	you	for
		been	have	were	what
		where	is	and	fnord
	)
};

# Always words: indexed whatever their length.
$EPrints::Index::FREETEXT_ALWAYS_WORDS = {
	map { $_ => 1 } qw( ok )
};

# Word-separator characters: more or less everything that is not
# A-Z, a-z, 0-9 or the single quote (').
#
# Extra separator characters added here should be encoded in utf8;
# the Unicode::String man page describes some helpful methods.

$EPrints::Index::FREETEXT_SEPERATOR_CHARS = {
	map { $_ => 1 } (
		' ', '!', '"', '#', '$', '%', '&',
		'(', ')', '*', '+', ',', '-', '.', '/',
		':', ';', '<', '=', '>', '?', '@',
		'[', '\\', ']', '^', '_', '`',
		'{', '|', '}', '~',
		chr(0xb4), # Acute Accent (closing quote)
	)
};

# Compiled character class matching any single separator listed
# above, plus every character in the range \x00-\x20.
{
	my $escaped = quotemeta(
		join( "", keys %{$EPrints::Index::FREETEXT_SEPERATOR_CHARS} )
	);
	$EPrints::Index::FREETEXT_SEPERATOR_REGEXP = qr/[$escaped\x00-\x20]/;
}

# Compatibility with Unicode::String keys: for every key currently
# in each table, store the same value again under the utf8-encoded
# byte form of that key.
for my $table (
	$EPrints::Index::FREETEXT_CHAR_MAPPING,
	$EPrints::Index::FREETEXT_SEPERATOR_CHARS
)
{
	# Take the key list before adding anything, so the entries
	# inserted below are not themselves visited.
	my @originals = keys %{$table};

	for my $key ( @originals )
	{
		# utf8::encode works in place, so encode a copy and
		# leave the original key untouched.
		my $octets = $key;
		utf8::encode( $octets );
		$table->{$octets} = $table->{$key};
	}
}

1;

=head1 COPYRIGHT

=begin COPYRIGHT

Copyright 2024 University of Southampton.
EPrints 3.4 is supplied by EPrints Services.

http://www.eprints.org/eprints-3.4/

=end COPYRIGHT

=begin LICENSE

This file is part of EPrints 3.4 L<http://www.eprints.org/>.

EPrints 3.4 and this file are released under the terms of the
GNU Lesser General Public License version 3 as published by
the Free Software Foundation unless otherwise stated.

EPrints 3.4 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints 3.4.
If not, see L<http://www.gnu.org/licenses/>.

=end LICENSE
