utf8.c 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. /* Licensed to the Apache Software Foundation (ASF) under one or more
  2. * contributor license agreements. See the NOTICE file distributed with
  3. * this work for additional information regarding copyright ownership.
  4. * The ASF licenses this file to You under the Apache License, Version 2.0
  5. * (the "License"); you may not use this file except in compliance with
  6. * the License. You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "fspr.h"
  17. #include "fspr_private.h"
  18. #include "fspr_errno.h"
  19. #include "fspr_arch_utf8.h"
  20. /* Implement the design principle specified by RFC 2718 2.2.5
  21. * Guidelines for new URL Schemes - within the APR.
  22. *
  23. * Since many architectures support Unicode, and UCS2 is the most
  24. * efficient storage used by those architectures, these functions
  25. * exist to validate a UCS string. It is up to the operating system
  26. * to determine the validity of the string in the context of its
  27. * native language support. File systems that support filename
  28. * characters of 0x80-0xff but have no support of Unicode will find
  29. * this function useful only for validating the character sequences
  30. * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
  31. * desired.
  32. *
  33. * from RFC 2279 UTF-8, a transformation format of ISO 10646
  34. *
  35. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  36. * 1:2 0000 0000-0000 007F 0xxxxxxx
  37. * 2:2 0000 0080-0000 07FF 110XXXXx 10xxxxxx
  38. * 3:2 0000 0800-0000 FFFF 1110XXXX 10Xxxxxx 10xxxxxx
  39. * 4:4 0001 0000-001F FFFF 11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
  40. * inv 0020 0000-03FF FFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
  41. * inv 0400 0000-7FFF FFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  42. *
  43. * One of the X values must be one for the encoding length to be legit.
  44. * Neither the z bit, nor the final two forms, are used for ucs-2
  45. *
  46. * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
  47. * Unicode parlance), being actually UCS-4 characters transformed
  48. * through UTF-16, need special treatment: the UTF-16 transformation
  49. * must be undone, yielding a UCS-4 character that is then transformed
  50. * as above."
  51. *
  52. * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
  53. *
  54. * U' = U - 0x10000
  55. * U' = 000000000000yyyyyyyyyyxxxxxxxxxx
  56. * W1 = 110110yyyyyyyyyy
  57. * W2 = 110111xxxxxxxxxx
  58. *
  59. * fspr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
  60. *
  61. * fspr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
  62. */
  63. APR_DECLARE(fspr_status_t) fspr_conv_utf8_to_ucs2(const char *in,
  64. fspr_size_t *inbytes,
  65. fspr_wchar_t *out,
  66. fspr_size_t *outwords)
  67. {
  68. fspr_int64_t newch, mask;
  69. fspr_size_t expect, eating;
  70. int ch;
  71. while (*inbytes && *outwords)
  72. {
  73. ch = (unsigned char)(*in++);
  74. if (!(ch & 0200)) {
  75. /* US-ASCII-7 plain text
  76. */
  77. --*inbytes;
  78. --*outwords;
  79. *(out++) = ch;
  80. }
  81. else
  82. {
  83. if ((ch & 0300) != 0300) {
  84. /* Multibyte Continuation is out of place
  85. */
  86. return APR_EINVAL;
  87. }
  88. else
  89. {
  90. /* Multibyte Sequence Lead Character
  91. *
  92. * Compute the expected bytes while adjusting
  93. * or lead byte and leading zeros mask.
  94. */
  95. mask = 0340;
  96. expect = 1;
  97. while ((ch & mask) == mask) {
  98. mask |= mask >> 1;
  99. if (++expect > 3) /* (truly 5 for ucs-4) */
  100. return APR_EINVAL;
  101. }
  102. newch = ch & ~mask;
  103. eating = expect + 1;
  104. if (*inbytes <= expect)
  105. return APR_INCOMPLETE;
  106. /* Reject values of excessive leading 0 bits
  107. * utf-8 _demands_ the shortest possible byte length
  108. */
  109. if (expect == 1) {
  110. if (!(newch & 0036))
  111. return APR_EINVAL;
  112. }
  113. else {
  114. /* Reject values of excessive leading 0 bits
  115. */
  116. if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
  117. return APR_EINVAL;
  118. if (expect == 2) {
  119. /* Reject values D800-DFFF when not utf16 encoded
  120. * (may not be an appropriate restriction for ucs-4)
  121. */
  122. if (newch == 0015 && ((unsigned char)*in & 0040))
  123. return APR_EINVAL;
  124. }
  125. else if (expect == 3) {
  126. /* Short circuit values > 110000
  127. */
  128. if (newch > 4)
  129. return APR_EINVAL;
  130. if (newch == 4 && ((unsigned char)*in & 0060))
  131. return APR_EINVAL;
  132. }
  133. }
  134. /* Where the boolean (expect > 2) is true, we will need
  135. * an extra word for the output.
  136. */
  137. if (*outwords < (fspr_size_t)(expect > 2) + 1)
  138. break; /* buffer full */
  139. while (expect--)
  140. {
  141. /* Multibyte Continuation must be legal */
  142. if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
  143. return APR_EINVAL;
  144. newch <<= 6;
  145. newch |= (ch & 0077);
  146. }
  147. *inbytes -= eating;
  148. /* newch is now a true ucs-4 character
  149. *
  150. * now we need to fold to ucs-2
  151. */
  152. if (newch < 0x10000)
  153. {
  154. --*outwords;
  155. *(out++) = (fspr_wchar_t) newch;
  156. }
  157. else
  158. {
  159. *outwords -= 2;
  160. newch -= 0x10000;
  161. *(out++) = (fspr_wchar_t) (0xD800 | (newch >> 10));
  162. *(out++) = (fspr_wchar_t) (0xDC00 | (newch & 0x03FF));
  163. }
  164. }
  165. }
  166. }
  167. /* Buffer full 'errors' aren't errors, the client must inspect both
  168. * the inbytes and outwords values
  169. */
  170. return APR_SUCCESS;
  171. }
  172. APR_DECLARE(fspr_status_t) fspr_conv_ucs2_to_utf8(const fspr_wchar_t *in,
  173. fspr_size_t *inwords,
  174. char *out,
  175. fspr_size_t *outbytes)
  176. {
  177. fspr_int64_t newch, require;
  178. fspr_size_t need;
  179. char *invout;
  180. int ch;
  181. while (*inwords && *outbytes)
  182. {
  183. ch = (unsigned short)(*in++);
  184. if (ch < 0x80)
  185. {
  186. --*inwords;
  187. --*outbytes;
  188. *(out++) = (unsigned char) ch;
  189. }
  190. else
  191. {
  192. if ((ch & 0xFC00) == 0xDC00) {
  193. /* Invalid Leading ucs-2 Multiword Continuation Character
  194. */
  195. return APR_EINVAL;
  196. }
  197. if ((ch & 0xFC00) == 0xD800) {
  198. /* Leading ucs-2 Multiword Character
  199. */
  200. if (*inwords < 2) {
  201. /* Missing ucs-2 Multiword Continuation Character
  202. */
  203. return APR_INCOMPLETE;
  204. }
  205. if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
  206. /* Invalid ucs-2 Multiword Continuation Character
  207. */
  208. return APR_EINVAL;
  209. }
  210. newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
  211. newch += 0x10000;
  212. }
  213. else {
  214. /* ucs-2 Single Word Character
  215. */
  216. newch = ch;
  217. }
  218. /* Determine the absolute minimum utf-8 bytes required
  219. */
  220. require = newch >> 11;
  221. need = 1;
  222. while (require)
  223. require >>= 5, ++need;
  224. if (need >= *outbytes)
  225. break; /* Insufficient buffer */
  226. *inwords -= (need > 2) + 1;
  227. *outbytes -= need + 1;
  228. /* Compute the utf-8 characters in last to first order,
  229. * calculating the lead character length bits along the way.
  230. */
  231. ch = 0200;
  232. out += need + 1;
  233. invout = out;
  234. while (need--) {
  235. ch |= ch >> 1;
  236. *(--invout) = (unsigned char)(0200 | (newch & 0077));
  237. newch >>= 6;
  238. }
  239. /* Compute the lead utf-8 character and move the dest offset
  240. */
  241. *(--invout) = (unsigned char)(ch | newch);
  242. }
  243. }
  244. /* Buffer full 'errors' aren't errors, the client must inspect both
  245. * the inwords and outbytes values
  246. */
  247. return APR_SUCCESS;
  248. }