encodings.h 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716
  1. // Tencent is pleased to support the open source community by making RapidJSON available.
  2. //
  3. // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
  4. //
  5. // Licensed under the MIT License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // http://opensource.org/licenses/MIT
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #ifndef RAPIDJSON_ENCODINGS_H_
  15. #define RAPIDJSON_ENCODINGS_H_
  16. #include "rapidjson.h"
  17. #ifdef _MSC_VER
  18. RAPIDJSON_DIAG_PUSH
  19. RAPIDJSON_DIAG_OFF(4244) // conversion from 'type1' to 'type2', possible loss of data
  20. RAPIDJSON_DIAG_OFF(4702) // unreachable code
  21. #elif defined(__GNUC__)
  22. RAPIDJSON_DIAG_PUSH
  23. RAPIDJSON_DIAG_OFF(effc++)
  24. RAPIDJSON_DIAG_OFF(overflow)
  25. #endif
  26. RAPIDJSON_NAMESPACE_BEGIN
  27. ///////////////////////////////////////////////////////////////////////////////
  28. // Encoding
  29. /*! \class rapidjson::Encoding
  30. \brief Concept for encoding of Unicode characters.
  31. \code
  32. concept Encoding {
  33. typename Ch; //! Type of character. A "character" is actually a code unit in unicode's definition.
  34. enum { supportUnicode = 1 }; // or 0 if not supporting unicode
  35. //! \brief Encode a Unicode codepoint to an output stream.
  36. //! \param os Output stream.
  37. //! \param codepoint A Unicode codepoint, ranging from 0x0 to 0x10FFFF inclusive.
  38. template<typename OutputStream>
  39. static void Encode(OutputStream& os, unsigned codepoint);
  40. //! \brief Decode a Unicode codepoint from an input stream.
  41. //! \param is Input stream.
  42. //! \param codepoint Output of the unicode codepoint.
  43. //! \return true if a valid codepoint can be decoded from the stream.
  44. template <typename InputStream>
  45. static bool Decode(InputStream& is, unsigned* codepoint);
  46. //! \brief Validate one Unicode codepoint from an encoded stream.
  47. //! \param is Input stream to obtain codepoint.
  48. //! \param os Output for copying one codepoint.
  49. //! \return true if it is valid.
  50. //! \note This function only validates and copies the codepoint; it does not actually decode it.
  51. template <typename InputStream, typename OutputStream>
  52. static bool Validate(InputStream& is, OutputStream& os);
  53. // The following functions deal with byte streams.
  54. //! Take a character from input byte stream, skipping the BOM if it exists.
  55. template <typename InputByteStream>
  56. static Ch TakeBOM(InputByteStream& is);
  57. //! Take a character from input byte stream.
  58. template <typename InputByteStream>
  59. static Ch Take(InputByteStream& is);
  60. //! Put BOM to output byte stream.
  61. template <typename OutputByteStream>
  62. static void PutBOM(OutputByteStream& os);
  63. //! Put a character to output byte stream.
  64. template <typename OutputByteStream>
  65. static void Put(OutputByteStream& os, Ch c);
  66. };
  67. \endcode
  68. */
  69. ///////////////////////////////////////////////////////////////////////////////
  70. // UTF8
  71. //! UTF-8 encoding.
  72. /*! http://en.wikipedia.org/wiki/UTF-8
  73. http://tools.ietf.org/html/rfc3629
  74. \tparam CharType Code unit for storing 8-bit UTF-8 data. Default is char.
  75. \note implements Encoding concept
  76. */
  77. template<typename CharType = char>
  78. struct UTF8 {
  79. typedef CharType Ch;
  80. enum { supportUnicode = 1 };
  81. template<typename OutputStream>
  82. static void Encode(OutputStream& os, unsigned codepoint) {
  83. if (codepoint <= 0x7F)
  84. os.Put(static_cast<Ch>(codepoint & 0xFF));
  85. else if (codepoint <= 0x7FF) {
  86. os.Put(static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
  87. os.Put(static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
  88. }
  89. else if (codepoint <= 0xFFFF) {
  90. os.Put(static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
  91. os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  92. os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  93. }
  94. else {
  95. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  96. os.Put(static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
  97. os.Put(static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
  98. os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  99. os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  100. }
  101. }
  102. template<typename OutputStream>
  103. static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
  104. if (codepoint <= 0x7F)
  105. PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
  106. else if (codepoint <= 0x7FF) {
  107. PutUnsafe(os, static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
  108. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
  109. }
  110. else if (codepoint <= 0xFFFF) {
  111. PutUnsafe(os, static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
  112. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  113. PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  114. }
  115. else {
  116. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  117. PutUnsafe(os, static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
  118. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
  119. PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
  120. PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
  121. }
  122. }
  123. template <typename InputStream>
  124. static bool Decode(InputStream& is, unsigned* codepoint) {
  125. #define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | (static_cast<unsigned char>(c) & 0x3Fu)
  126. #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
  127. #define TAIL() COPY(); TRANS(0x70)
  128. typename InputStream::Ch c = is.Take();
  129. if (!(c & 0x80)) {
  130. *codepoint = static_cast<unsigned char>(c);
  131. return true;
  132. }
  133. unsigned char type = GetRange(static_cast<unsigned char>(c));
  134. if (type >= 32) {
  135. *codepoint = 0;
  136. } else {
  137. *codepoint = (0xFF >> type) & static_cast<unsigned char>(c);
  138. }
  139. bool result = true;
  140. switch (type) {
  141. case 2: TAIL(); return result;
  142. case 3: TAIL(); TAIL(); return result;
  143. case 4: COPY(); TRANS(0x50); TAIL(); return result;
  144. case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result;
  145. case 6: TAIL(); TAIL(); TAIL(); return result;
  146. case 10: COPY(); TRANS(0x20); TAIL(); return result;
  147. case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;
  148. default: return false;
  149. }
  150. #undef COPY
  151. #undef TRANS
  152. #undef TAIL
  153. }
  154. template <typename InputStream, typename OutputStream>
  155. static bool Validate(InputStream& is, OutputStream& os) {
  156. #define COPY() os.Put(c = is.Take())
  157. #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
  158. #define TAIL() COPY(); TRANS(0x70)
  159. Ch c;
  160. COPY();
  161. if (!(c & 0x80))
  162. return true;
  163. bool result = true;
  164. switch (GetRange(static_cast<unsigned char>(c))) {
  165. case 2: TAIL(); return result;
  166. case 3: TAIL(); TAIL(); return result;
  167. case 4: COPY(); TRANS(0x50); TAIL(); return result;
  168. case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result;
  169. case 6: TAIL(); TAIL(); TAIL(); return result;
  170. case 10: COPY(); TRANS(0x20); TAIL(); return result;
  171. case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;
  172. default: return false;
  173. }
  174. #undef COPY
  175. #undef TRANS
  176. #undef TAIL
  177. }
  178. static unsigned char GetRange(unsigned char c) {
  179. // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  180. // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
  181. static const unsigned char type[] = {
  182. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  183. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  184. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  185. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  186. 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
  187. 0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
  188. 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
  189. 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
  190. 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  191. 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
  192. };
  193. return type[c];
  194. }
  195. template <typename InputByteStream>
  196. static CharType TakeBOM(InputByteStream& is) {
  197. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  198. typename InputByteStream::Ch c = Take(is);
  199. if (static_cast<unsigned char>(c) != 0xEFu) return c;
  200. c = is.Take();
  201. if (static_cast<unsigned char>(c) != 0xBBu) return c;
  202. c = is.Take();
  203. if (static_cast<unsigned char>(c) != 0xBFu) return c;
  204. c = is.Take();
  205. return c;
  206. }
  207. template <typename InputByteStream>
  208. static Ch Take(InputByteStream& is) {
  209. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  210. return static_cast<Ch>(is.Take());
  211. }
  212. template <typename OutputByteStream>
  213. static void PutBOM(OutputByteStream& os) {
  214. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  215. os.Put(static_cast<typename OutputByteStream::Ch>(0xEFu));
  216. os.Put(static_cast<typename OutputByteStream::Ch>(0xBBu));
  217. os.Put(static_cast<typename OutputByteStream::Ch>(0xBFu));
  218. }
  219. template <typename OutputByteStream>
  220. static void Put(OutputByteStream& os, Ch c) {
  221. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  222. os.Put(static_cast<typename OutputByteStream::Ch>(c));
  223. }
  224. };
  225. ///////////////////////////////////////////////////////////////////////////////
  226. // UTF16
  227. //! UTF-16 encoding.
  228. /*! http://en.wikipedia.org/wiki/UTF-16
  229. http://tools.ietf.org/html/rfc2781
  230. \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead.
  231. \note implements Encoding concept
  232. \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness.
  233. For streaming, use UTF16LE and UTF16BE, which handle endianness.
  234. */
  235. template<typename CharType = wchar_t>
  236. struct UTF16 {
  237. typedef CharType Ch;
  238. RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 2);
  239. enum { supportUnicode = 1 };
  240. template<typename OutputStream>
  241. static void Encode(OutputStream& os, unsigned codepoint) {
  242. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
  243. if (codepoint <= 0xFFFF) {
  244. RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
  245. os.Put(static_cast<typename OutputStream::Ch>(codepoint));
  246. }
  247. else {
  248. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  249. unsigned v = codepoint - 0x10000;
  250. os.Put(static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
  251. os.Put((v & 0x3FF) | 0xDC00);
  252. }
  253. }
  254. template<typename OutputStream>
  255. static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
  256. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
  257. if (codepoint <= 0xFFFF) {
  258. RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
  259. PutUnsafe(os, static_cast<typename OutputStream::Ch>(codepoint));
  260. }
  261. else {
  262. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  263. unsigned v = codepoint - 0x10000;
  264. PutUnsafe(os, static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
  265. PutUnsafe(os, (v & 0x3FF) | 0xDC00);
  266. }
  267. }
  268. template <typename InputStream>
  269. static bool Decode(InputStream& is, unsigned* codepoint) {
  270. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
  271. typename InputStream::Ch c = is.Take();
  272. if (c < 0xD800 || c > 0xDFFF) {
  273. *codepoint = static_cast<unsigned>(c);
  274. return true;
  275. }
  276. else if (c <= 0xDBFF) {
  277. *codepoint = (static_cast<unsigned>(c) & 0x3FF) << 10;
  278. c = is.Take();
  279. *codepoint |= (static_cast<unsigned>(c) & 0x3FF);
  280. *codepoint += 0x10000;
  281. return c >= 0xDC00 && c <= 0xDFFF;
  282. }
  283. return false;
  284. }
  285. template <typename InputStream, typename OutputStream>
  286. static bool Validate(InputStream& is, OutputStream& os) {
  287. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
  288. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
  289. typename InputStream::Ch c;
  290. os.Put(static_cast<typename OutputStream::Ch>(c = is.Take()));
  291. if (c < 0xD800 || c > 0xDFFF)
  292. return true;
  293. else if (c <= 0xDBFF) {
  294. os.Put(c = is.Take());
  295. return c >= 0xDC00 && c <= 0xDFFF;
  296. }
  297. return false;
  298. }
  299. };
  300. //! UTF-16 little endian encoding.
  301. template<typename CharType = wchar_t>
  302. struct UTF16LE : UTF16<CharType> {
  303. template <typename InputByteStream>
  304. static CharType TakeBOM(InputByteStream& is) {
  305. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  306. CharType c = Take(is);
  307. return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
  308. }
  309. template <typename InputByteStream>
  310. static CharType Take(InputByteStream& is) {
  311. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  312. unsigned c = static_cast<uint8_t>(is.Take());
  313. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  314. return static_cast<CharType>(c);
  315. }
  316. template <typename OutputByteStream>
  317. static void PutBOM(OutputByteStream& os) {
  318. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  319. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  320. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  321. }
  322. template <typename OutputByteStream>
  323. static void Put(OutputByteStream& os, CharType c) {
  324. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  325. os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu));
  326. os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu));
  327. }
  328. };
  329. //! UTF-16 big endian encoding.
  330. template<typename CharType = wchar_t>
  331. struct UTF16BE : UTF16<CharType> {
  332. template <typename InputByteStream>
  333. static CharType TakeBOM(InputByteStream& is) {
  334. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  335. CharType c = Take(is);
  336. return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
  337. }
  338. template <typename InputByteStream>
  339. static CharType Take(InputByteStream& is) {
  340. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  341. unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  342. c |= static_cast<uint8_t>(is.Take());
  343. return static_cast<CharType>(c);
  344. }
  345. template <typename OutputByteStream>
  346. static void PutBOM(OutputByteStream& os) {
  347. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  348. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  349. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  350. }
  351. template <typename OutputByteStream>
  352. static void Put(OutputByteStream& os, CharType c) {
  353. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  354. os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu));
  355. os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu));
  356. }
  357. };
  358. ///////////////////////////////////////////////////////////////////////////////
  359. // UTF32
  360. //! UTF-32 encoding.
  361. /*! http://en.wikipedia.org/wiki/UTF-32
  362. \tparam CharType Type for storing 32-bit UTF-32 data. Default is unsigned. C++11 may use char32_t instead.
  363. \note implements Encoding concept
  364. \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness.
  365. For streaming, use UTF32LE and UTF32BE, which handle endianness.
  366. */
  367. template<typename CharType = unsigned>
  368. struct UTF32 {
  369. typedef CharType Ch;
  370. RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 4);
  371. enum { supportUnicode = 1 };
  372. template<typename OutputStream>
  373. static void Encode(OutputStream& os, unsigned codepoint) {
  374. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
  375. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  376. os.Put(codepoint);
  377. }
  378. template<typename OutputStream>
  379. static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
  380. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
  381. RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
  382. PutUnsafe(os, codepoint);
  383. }
  384. template <typename InputStream>
  385. static bool Decode(InputStream& is, unsigned* codepoint) {
  386. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
  387. Ch c = is.Take();
  388. *codepoint = c;
  389. return c <= 0x10FFFF;
  390. }
  391. template <typename InputStream, typename OutputStream>
  392. static bool Validate(InputStream& is, OutputStream& os) {
  393. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
  394. Ch c;
  395. os.Put(c = is.Take());
  396. return c <= 0x10FFFF;
  397. }
  398. };
  399. //! UTF-32 little endian enocoding.
  400. template<typename CharType = unsigned>
  401. struct UTF32LE : UTF32<CharType> {
  402. template <typename InputByteStream>
  403. static CharType TakeBOM(InputByteStream& is) {
  404. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  405. CharType c = Take(is);
  406. return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
  407. }
  408. template <typename InputByteStream>
  409. static CharType Take(InputByteStream& is) {
  410. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  411. unsigned c = static_cast<uint8_t>(is.Take());
  412. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  413. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
  414. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
  415. return static_cast<CharType>(c);
  416. }
  417. template <typename OutputByteStream>
  418. static void PutBOM(OutputByteStream& os) {
  419. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  420. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  421. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  422. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  423. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  424. }
  425. template <typename OutputByteStream>
  426. static void Put(OutputByteStream& os, CharType c) {
  427. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  428. os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
  429. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
  430. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
  431. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
  432. }
  433. };
  434. //! UTF-32 big endian encoding.
  435. template<typename CharType = unsigned>
  436. struct UTF32BE : UTF32<CharType> {
  437. template <typename InputByteStream>
  438. static CharType TakeBOM(InputByteStream& is) {
  439. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  440. CharType c = Take(is);
  441. return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
  442. }
  443. template <typename InputByteStream>
  444. static CharType Take(InputByteStream& is) {
  445. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  446. unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
  447. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
  448. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
  449. c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take()));
  450. return static_cast<CharType>(c);
  451. }
  452. template <typename OutputByteStream>
  453. static void PutBOM(OutputByteStream& os) {
  454. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  455. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  456. os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
  457. os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
  458. os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
  459. }
  460. template <typename OutputByteStream>
  461. static void Put(OutputByteStream& os, CharType c) {
  462. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  463. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
  464. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
  465. os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
  466. os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
  467. }
  468. };
  469. ///////////////////////////////////////////////////////////////////////////////
  470. // ASCII
  471. //! ASCII encoding.
  472. /*! http://en.wikipedia.org/wiki/ASCII
  473. \tparam CharType Code unit for storing 7-bit ASCII data. Default is char.
  474. \note implements Encoding concept
  475. */
  476. template<typename CharType = char>
  477. struct ASCII {
  478. typedef CharType Ch;
  479. enum { supportUnicode = 0 };
  480. template<typename OutputStream>
  481. static void Encode(OutputStream& os, unsigned codepoint) {
  482. RAPIDJSON_ASSERT(codepoint <= 0x7F);
  483. os.Put(static_cast<Ch>(codepoint & 0xFF));
  484. }
  485. template<typename OutputStream>
  486. static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
  487. RAPIDJSON_ASSERT(codepoint <= 0x7F);
  488. PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
  489. }
  490. template <typename InputStream>
  491. static bool Decode(InputStream& is, unsigned* codepoint) {
  492. uint8_t c = static_cast<uint8_t>(is.Take());
  493. *codepoint = c;
  494. return c <= 0X7F;
  495. }
  496. template <typename InputStream, typename OutputStream>
  497. static bool Validate(InputStream& is, OutputStream& os) {
  498. uint8_t c = static_cast<uint8_t>(is.Take());
  499. os.Put(static_cast<typename OutputStream::Ch>(c));
  500. return c <= 0x7F;
  501. }
  502. template <typename InputByteStream>
  503. static CharType TakeBOM(InputByteStream& is) {
  504. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  505. uint8_t c = static_cast<uint8_t>(Take(is));
  506. return static_cast<Ch>(c);
  507. }
  508. template <typename InputByteStream>
  509. static Ch Take(InputByteStream& is) {
  510. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  511. return static_cast<Ch>(is.Take());
  512. }
  513. template <typename OutputByteStream>
  514. static void PutBOM(OutputByteStream& os) {
  515. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  516. (void)os;
  517. }
  518. template <typename OutputByteStream>
  519. static void Put(OutputByteStream& os, Ch c) {
  520. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  521. os.Put(static_cast<typename OutputByteStream::Ch>(c));
  522. }
  523. };
  524. ///////////////////////////////////////////////////////////////////////////////
  525. // AutoUTF
  526. //! Runtime-specified UTF encoding type of a stream. The explicit values matter: AutoUTF indexes its function tables with the result of the stream's GetType() (presumably a UTFType), and those tables list the encodings in this same order, so keep both in sync.
  527. enum UTFType {
  528. kUTF8 = 0, //!< UTF-8 (table index 0).
  529. kUTF16LE = 1, //!< UTF-16 little endian (table index 1).
  530. kUTF16BE = 2, //!< UTF-16 big endian (table index 2).
  531. kUTF32LE = 3, //!< UTF-32 little endian (table index 3).
  532. kUTF32BE = 4 //!< UTF-32 big endian (table index 4).
  533. };
  534. //! Dynamically select encoding according to stream's runtime-specified UTF encoding type.
  535. /*! \note This class can be used with AutoUTFInputtStream and AutoUTFOutputStream, which provides GetType().
  536. */
  537. template<typename CharType>
  538. struct AutoUTF {
  539. typedef CharType Ch;
  540. enum { supportUnicode = 1 };
  541. #define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
  542. template<typename OutputStream>
  543. RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) {
  544. typedef void (*EncodeFunc)(OutputStream&, unsigned);
  545. static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Encode) };
  546. (*f[os.GetType()])(os, codepoint);
  547. }
  548. template<typename OutputStream>
  549. RAPIDJSON_FORCEINLINE static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
  550. typedef void (*EncodeFunc)(OutputStream&, unsigned);
  551. static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(EncodeUnsafe) };
  552. (*f[os.GetType()])(os, codepoint);
  553. }
  554. template <typename InputStream>
  555. RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) {
  556. typedef bool (*DecodeFunc)(InputStream&, unsigned*);
  557. static const DecodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Decode) };
  558. return (*f[is.GetType()])(is, codepoint);
  559. }
  560. template <typename InputStream, typename OutputStream>
  561. RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
  562. typedef bool (*ValidateFunc)(InputStream&, OutputStream&);
  563. static const ValidateFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Validate) };
  564. return (*f[is.GetType()])(is, os);
  565. }
  566. #undef RAPIDJSON_ENCODINGS_FUNC
  567. };
  568. ///////////////////////////////////////////////////////////////////////////////
  569. // Transcoder
  570. //! Encoding conversion.
  571. template<typename SourceEncoding, typename TargetEncoding>
  572. struct Transcoder {
  573. //! Take one Unicode codepoint from source encoding, convert it to target encoding and put it to the output stream.
  574. template<typename InputStream, typename OutputStream>
  575. RAPIDJSON_FORCEINLINE static bool Transcode(InputStream& is, OutputStream& os) {
  576. unsigned codepoint;
  577. if (!SourceEncoding::Decode(is, &codepoint))
  578. return false;
  579. TargetEncoding::Encode(os, codepoint);
  580. return true;
  581. }
  582. template<typename InputStream, typename OutputStream>
  583. RAPIDJSON_FORCEINLINE static bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
  584. unsigned codepoint;
  585. if (!SourceEncoding::Decode(is, &codepoint))
  586. return false;
  587. TargetEncoding::EncodeUnsafe(os, codepoint);
  588. return true;
  589. }
  590. //! Validate one Unicode codepoint from an encoded stream.
  591. template<typename InputStream, typename OutputStream>
  592. RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
  593. return Transcode(is, os); // Since source/target encoding is different, must transcode.
  594. }
  595. };
  596. // Forward declaration of PutUnsafe(), which the Transcoder<Encoding, Encoding> specialization below calls in TranscodeUnsafe(); the definition is not part of this header.
  597. template<typename Stream>
  598. inline void PutUnsafe(Stream& stream, typename Stream::Ch c);
  599. //! Specialization of Transcoder with same source and target encoding.
  600. template<typename Encoding>
  601. struct Transcoder<Encoding, Encoding> {
  602. template<typename InputStream, typename OutputStream>
  603. RAPIDJSON_FORCEINLINE static bool Transcode(InputStream& is, OutputStream& os) {
  604. os.Put(is.Take()); // Just copy one code unit. This semantic is different from primary template class.
  605. return true;
  606. }
  607. template<typename InputStream, typename OutputStream>
  608. RAPIDJSON_FORCEINLINE static bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
  609. PutUnsafe(os, is.Take()); // Just copy one code unit. This semantic is different from primary template class.
  610. return true;
  611. }
  612. template<typename InputStream, typename OutputStream>
  613. RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
  614. return Encoding::Validate(is, os); // source/target encoding are the same
  615. }
  616. };
  617. RAPIDJSON_NAMESPACE_END
  618. #if defined(__GNUC__) || defined(_MSC_VER)
  619. RAPIDJSON_DIAG_POP
  620. #endif
  621. #endif // RAPIDJSON_ENCODINGS_H_