/**
* Character Set Handling
* 
* Copyright:
* (C) 1999-2007 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.utils.charset;

import botan.constants;
import std.array : Appender;
import botan.utils.types;
import botan.utils.exceptn;
import std.conv : to;

/**
* The different charsets (nominally) supported by Botan.
*
* Note that transcode() treats LOCAL_CHARSET as an alias for LATIN1_CHARSET.
*/
alias ubyte CharacterSet;
enum : CharacterSet {
    LOCAL_CHARSET,
    UCS2_CHARSET,
    UTF8_CHARSET,
    LATIN1_CHARSET
}

/*
* Character Set Handling
*/

/**
* Convert from UCS-2 to ISO 8859-1.
*
* The input is read as consecutive byte pairs; the first byte of each pair
* must be zero and the second byte is emitted as the Latin-1 character
* (i.e. each pair is read high-order byte first).
*
* Params:
*  ucs2 = raw UCS-2 bytes carried in a string
*
* Returns: the Latin-1 string, one byte per input pair
*
* Throws: DecodingError if the input has an odd number of bytes or contains
*         a pair whose first byte is non-zero (a character outside Latin-1)
*/
string ucs2ToLatin1(in string ucs2)
{
    if (ucs2.length % 2 == 1)
        throw new DecodingError("UCS-2 string has an odd number of bytes");

    Vector!char latin1;
    // Every two input bytes produce exactly one output byte.
    latin1.reserve(ucs2.length / 2);

    for (size_t i = 0; i != ucs2.length; i += 2)
    {
        const ubyte c1 = ucs2[i];
        const ubyte c2 = ucs2[i+1];

        if (c1 != 0)
            throw new DecodingError("UCS-2 has non-Latin1 characters");

        latin1 ~= cast(char)(c2);
    }

    string ret = latin1.ptr[0 .. latin1.length].idup;

    //logDebug(ret);

    return ret;
}

/**
* Convert from UTF-8 to ISO 8859-1.
*
* Only code points U+0000..U+00FF are representable in Latin-1. In UTF-8
* these are either a single byte 0x00..0x7F, or a two-byte sequence whose
* lead byte is 0xC2 or 0xC3 followed by one continuation byte (10xxxxxx).
*
* Params:
*  utf8 = UTF-8 encoded input
*
* Returns: the Latin-1 string
*
* Throws: DecodingError if a two-byte sequence is truncated, is an over-long
*         encoding (lead byte 0xC0 or 0xC1), has an invalid continuation
*         byte, or if the input contains any other byte that cannot start a
*         Latin-1 code point
*/
string utf8ToLatin1(in string utf8)
{
    Vector!char iso8859;
    iso8859.reserve(utf8.length);
    size_t position = 0;
    while (position != utf8.length)
    {
        const ubyte c1 = cast(ubyte)(utf8[position++]);

        if (c1 <= 0x7F)
            iso8859 ~= cast(char)(c1);
        else if (c1 >= 0xC0 && c1 <= 0xC3)
        {
            if (position == utf8.length)
                throw new DecodingError("UTF-8: sequence truncated");

            const ubyte c2 = cast(ubyte)(utf8[position++]);

            // Lead bytes 0xC0 and 0xC1 can only encode U+0000..U+007F,
            // which must be written as a single byte.
            if (c1 < 0xC2)
                throw new DecodingError("UTF-8: sequence longer than needed");

            // The second byte must have the form 10xxxxxx.
            if ((c2 & 0xC0) != 0x80)
                throw new DecodingError("UTF-8: invalid continuation byte");

            // c1 is 0xC2 or 0xC3 here, so the result is 0x80..0xFF and the
            // narrowing cast cannot discard any bits.
            const ubyte iso_char = cast(ubyte)(((c1 & 0x03) << 6) | (c2 & 0x3F));

            iso8859 ~= cast(char)(iso_char);
        }
        else
            throw new DecodingError("UTF-8: Unicode chars not in Latin1 used");
    }
    string ret = iso8859.ptr[0 .. iso8859.length].idup;
    //logTrace("utf8ToLatin1: ", ret);
    return ret;
}

/**
* Convert from ISO 8859-1 to UTF-8.
*
* Bytes 0x00..0x7F are copied unchanged; bytes 0x80..0xFF are expanded to a
* two-byte UTF-8 sequence.
*
* Params:
*  iso8859 = Latin-1 encoded input
*
* Returns: the UTF-8 encoded string
*/
string latin1ToUtf8(in string iso8859)
{
    Vector!char utf8;
    utf8.reserve(iso8859.length);
    for (size_t i = 0; i != iso8859.length; ++i)
    {
        const ubyte c = cast(ubyte)(iso8859[i]);

        if (c <= 0x7F)
            utf8 ~= cast(char)(c);
        else
        {
            // Lead byte carries the top two bits, continuation the low six.
            utf8 ~= cast(char)((0xC0 | (c >> 6)));
            utf8 ~= cast(char)((0x80 | (c & 0x3F)));
        }
    }
    string ret = utf8.ptr[0 .. utf8.length].idup;
    //logTrace("latin1ToUtf8: ", ret);
    return ret;
}

/**
* Perform character set transcoding.
*
* LOCAL_CHARSET is treated as LATIN1_CHARSET for both arguments. When source
* and target are the same the input is returned unchanged. Supported
* conversions are Latin-1 to UTF-8, UTF-8 to Latin-1 and UCS-2 to Latin-1.
*
* Params:
*  str  = the input string
*  to   = the target character set
*  from = the character set of str
*
* Returns: str converted to the target character set
*
* Throws: InvalidArgument for any unsupported conversion; the individual
*         converters throw DecodingError on malformed input
*/
string transcode(in string str, CharacterSet to, CharacterSet from)
{
    if (to == LOCAL_CHARSET)
        to = LATIN1_CHARSET;
    if (from == LOCAL_CHARSET)
        from = LATIN1_CHARSET;

    if (to == from)
        return str;

    if (from == LATIN1_CHARSET && to == UTF8_CHARSET)
        return latin1ToUtf8(str);
    if (from == UTF8_CHARSET && to == LATIN1_CHARSET)
        return utf8ToLatin1(str);
    if (from == UCS2_CHARSET && to == LATIN1_CHARSET)
        return ucs2ToLatin1(str);

    // The parameter named `to` shadows std.conv.to, hence the leading dot
    // to reach the module-scope symbol.
    throw new InvalidArgument("Unknown transcoding operation from " ~ .to!string(from) ~ " to " ~ .to!string(to));
}

/**
* Check if a character represents a digit.
*
* Returns: true if c is one of '0'..'9'
*/
bool isDigit(char c)
{
    return c >= '0' && c <= '9';
}

/**
* Check if a character represents whitespace.
*
* Returns: true if c is a space, tab, line feed or carriage return
*/
bool isSpace(char c)
{
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/**
* Convert a character to a digit.
*
* Params:
*  c = a character in '0'..'9'
*
* Returns: the numeric value 0..9 of c
*
* Throws: InvalidArgument if c is not a decimal digit character
*/
ubyte char2digit(char c)
{
    if (c >= '0' && c <= '9')
        return cast(ubyte)(c - '0');

    throw new InvalidArgument("char2digit: Input is not a digit character");
}

/**
* Convert a digit to a character.
*
* Params:
*  b = a value in 0..9
*
* Returns: the character '0'..'9' representing b
*
* Throws: InvalidArgument if b is greater than 9
*/
char digit2char(ubyte b)
{
    if (b <= 9)
        return cast(char)('0' + b);

    throw new InvalidArgument("digit2char: Input is not a digit: " ~ b.to!string);
}

/**
* Case-insensitive character comparison.
*
* Both arguments are lowered with std.ascii.toLower before comparison.
*
* Returns: true if a and b are equal after lowering
*/
bool caselessCmp(T)(T a, T b)
{
    import std.ascii : toLower;
    return (toLower(a) == toLower(b));
}