1 /**
2 * Character Set Handling
3 * 
4 * Copyright:
5 * (C) 1999-2007 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *
8 * License:
9 * Botan is released under the Simplified BSD License (see LICENSE.md)
10 */
11 module botan.utils.charset;
12 
13 import botan.constants;
14 import std.array : Appender;
15 import botan.utils.types;
16 import botan.utils.exceptn;
17 import std.conv : to;
18 
/**
* Identifiers for the character sets (nominally) supported by Botan.
*
* The identifiers are plain `ubyte` constants declared through an anonymous
* enum, so they are used unqualified (e.g. `UTF8_CHARSET`).
*/
alias CharacterSet = ubyte;

enum : CharacterSet
{
    /// Local character set
    LOCAL_CHARSET  = 0,
    /// UCS-2
    UCS2_CHARSET   = 1,
    /// UTF-8
    UTF8_CHARSET   = 2,
    /// ISO 8859-1 (Latin-1)
    LATIN1_CHARSET = 3
}
29 
30 /*
31 * Character Set Handling
32 */
/**
* Convert from UCS-2 to ISO 8859-1.
*
* The input is read as a sequence of 2-byte units with the high-order byte
* first. Only code points whose high byte is zero can be represented in
* Latin-1.
*
* Params:
*  ucs2 = raw UCS-2 bytes
*
* Returns: one Latin-1 byte per input unit
*
* Throws: DecodingError if the input has an odd number of bytes or contains
*         a unit whose high byte is non-zero
*/
string ucs2ToLatin1(in string ucs2)
{
    if (ucs2.length % 2 == 1)
        throw new DecodingError("UCS-2 string has an odd number of bytes");

    Vector!char latin1;
    // Every 2-byte input unit produces exactly one output byte.
    latin1.reserve(ucs2.length / 2);

    for (size_t i = 0; i != ucs2.length; i += 2)
    {
        const ubyte high = cast(ubyte)(ucs2[i]);
        const ubyte low = cast(ubyte)(ucs2[i + 1]);

        if (high != 0)
            throw new DecodingError("UCS-2 has non-Latin1 characters");

        latin1 ~= cast(char)(low);
    }

    // Copy out of the temporary buffer into an immutable string.
    return latin1.ptr[0 .. latin1.length].idup;
}
61 
/**
* Convert from UTF-8 to ISO 8859-1.
*
* ASCII bytes are copied through unchanged. Two-byte sequences are decoded
* when they encode a code point in the range U+0080..U+00FF; everything else
* is rejected.
*
* Params:
*  utf8 = UTF-8 encoded input
*
* Returns: the Latin-1 encoding of the input, one byte per code point
*
* Throws: DecodingError if the input is truncated, contains a malformed or
*         overlong sequence, or contains a code point above U+00FF
*/
string utf8ToLatin1(in string utf8)
{
    Vector!char iso8859;
    iso8859.reserve(utf8.length);

    size_t position = 0;
    while (position != utf8.length)
    {
        const ubyte c1 = cast(ubyte)(utf8[position++]);

        if (c1 <= 0x7F)
        {
            iso8859 ~= cast(char)(c1);
            continue;
        }

        // Only the lead bytes 0xC0..0xC3 can start a two-byte sequence whose
        // value fits in 8 bits. Higher lead bytes encode code points above
        // U+00FF, and 0x80..0xBF are continuation bytes with no lead byte.
        if (c1 < 0xC0 || c1 > 0xC3)
            throw new DecodingError("UTF-8: Unicode chars not in Latin1 used");

        if (position == utf8.length)
            throw new DecodingError("UTF-8: sequence truncated");

        const ubyte c2 = cast(ubyte)(utf8[position++]);

        // A continuation byte must have the bit pattern 10xxxxxx.
        if ((c2 & 0xC0) != 0x80)
            throw new DecodingError("UTF-8: invalid continuation byte");

        const ubyte iso_char = cast(ubyte)(((c1 & 0x03) << 6) | (c2 & 0x3F));

        // Lead bytes 0xC0 and 0xC1 can only produce values that fit in one byte.
        if (iso_char <= 0x7F)
            throw new DecodingError("UTF-8: sequence longer than needed");

        iso8859 ~= cast(char)(iso_char);
    }

    // Copy out of the temporary buffer into an immutable string.
    return iso8859.ptr[0 .. iso8859.length].idup;
}
96 
/**
* Convert from ISO 8859-1 to UTF-8.
*
* Bytes up to 0x7F are emitted unchanged; every other byte is expanded to a
* two-byte UTF-8 sequence.
*
* Params:
*  iso8859 = Latin-1 encoded input
*
* Returns: the UTF-8 encoding of the input
*/
string latin1ToUtf8(in string iso8859)
{
    Vector!char utf8;
    utf8.reserve(iso8859.length);

    foreach (const char unit; iso8859)
    {
        const ubyte c = cast(ubyte)(unit);

        if (c > 0x7F)
        {
            // Lead byte carries the top two bits, continuation byte the low six.
            utf8 ~= cast(char)(0xC0 | (c >> 6));
            utf8 ~= cast(char)(0x80 | (c & 0x3F));
        }
        else
            utf8 ~= cast(char)(c);
    }

    return utf8.ptr[0 .. utf8.length].idup;
}
120 
/**
* Perform character set transcoding.
*
* LOCAL_CHARSET is treated as LATIN1_CHARSET on both sides. When source and
* target are the same, the input is returned unchanged.
*
* Params:
*  str = the text to convert
*  to = target character set
*  from = character set `str` is currently encoded in
*
* Returns: `str` converted to the target character set
*
* Throws: InvalidArgument if the requested conversion is not one of
*         Latin-1 to UTF-8, UTF-8 to Latin-1 or UCS-2 to Latin-1
*/
string transcode(in string str, CharacterSet to, CharacterSet from)
{
    const CharacterSet target = (to == LOCAL_CHARSET) ? LATIN1_CHARSET : to;
    const CharacterSet source = (from == LOCAL_CHARSET) ? LATIN1_CHARSET : from;

    if (source == target)
        return str;

    if (target == LATIN1_CHARSET)
    {
        if (source == UTF8_CHARSET)
            return utf8ToLatin1(str);
        if (source == UCS2_CHARSET)
            return ucs2ToLatin1(str);
    }
    else if (target == UTF8_CHARSET && source == LATIN1_CHARSET)
        return latin1ToUtf8(str);

    // The parameter `to` shadows std.conv.to, hence the module-scope `.to`.
    throw new InvalidArgument("Unknown transcoding operation from " ~ .to!string(source) ~ " to " ~ .to!string(target));
}
143 
/**
* Check if a character represents a digit.
*
* Params:
*  c = the character to test
*
* Returns: true if `c` is one of the ASCII characters '0' through '9'
*/
bool isDigit(char c)
{
    return '0' <= c && c <= '9';
}
154 
/**
* Check if a character represents whitespace.
*
* Params:
*  c = the character to test
*
* Returns: true if `c` is a space, horizontal tab, line feed or carriage return
*/
bool isSpace(char c)
{
    switch (c)
    {
        case ' ', '\t', '\n', '\r':
            return true;
        default:
            return false;
    }
}
164 
/**
* Convert a character to a digit.
*
* Params:
*  c = an ASCII digit character, '0' through '9'
*
* Returns: the numeric value of `c`, 0 through 9
*
* Throws: InvalidArgument if `c` is not a digit character
*/
ubyte char2digit(char c)
{
    if (c < '0' || c > '9')
        throw new InvalidArgument("char2digit: Input is not a digit character");

    return cast(ubyte)(c - '0');
}
187 
/**
* Convert a digit to a character.
*
* Params:
*  b = a value from 0 through 9
*
* Returns: the ASCII character '0' through '9' corresponding to `b`
*
* Throws: InvalidArgument if `b` is greater than 9
*/
char digit2char(ubyte b)
{
    if (b > 9)
        throw new InvalidArgument("digit2char: Input is not a digit: " ~ b.to!string);

    return cast(char)('0' + b);
}
210 
/**
* Case-insensitive character comparison.
*
* Both arguments are passed through `std.ascii.toLower` before being compared.
*
* Params:
*  a = first character
*  b = second character
*
* Returns: true if the lowered forms of `a` and `b` are equal
*/
bool caselessCmp(T)(T a, T b)
{
    import std.ascii : toLower;

    const lhs = toLower(a);
    const rhs = toLower(b);
    return lhs == rhs;
}