You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

156 lines
4.1 KiB

  1. /* Copyright (c) 2018, Google Inc.
  2. *
  3. * Permission to use, copy, modify, and/or distribute this software for any
  4. * purpose with or without fee is hereby granted, provided that the above
  5. * copyright notice and this permission notice appear in all copies.
  6. *
  7. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  8. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  9. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  10. * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  11. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
  12. * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  13. * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
  14. #include <openssl/bytestring.h>
  15. #include "internal.h"
  // is_valid_code_point returns one if |v| is a code point that the functions
  // in this file are willing to encode or decode, and zero otherwise.
  // References in the following are to Unicode 9.0.0.
  static int is_valid_code_point(uint32_t v) {
    // The Unicode space runs from zero to 0x10ffff (3.4 D9).
    if (v > 0x10ffff) {
      return 0;
    }

    // Values 0x...fffe and 0x...ffff are permanently reserved (3.4 D14). The
    // mask drops the plane bits and the lowest bit, so a single comparison
    // covers both values in every plane.
    if ((v & 0xfffe) == 0xfffe) {
      return 0;
    }

    // 0xfdd0-0xfdef are also permanently reserved (3.4 D14).
    if (0xfdd0 <= v && v <= 0xfdef) {
      return 0;
    }

    // Surrogate code points are invalid (3.2 C1).
    if (0xd800 <= v && v <= 0xdfff) {
      return 0;
    }

    return 1;
  }
  // BOTTOM_BITS returns a byte with the bottom |n| bits set. |n| must be in
  // the range 0..8. The whole expansion is parenthesized so the macro behaves
  // as a single primary expression wherever it is used.
  #define BOTTOM_BITS(n) ((uint8_t)((1u << (n)) - 1))

  // TOP_BITS returns a byte with the top |n| bits set. |n| must be in the
  // range 0..8. The outer cast truncates the result of |~|, which is computed
  // at |int| width after integer promotion, back down to one byte.
  #define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n)))
  34. int cbs_get_utf8(CBS *cbs, uint32_t *out) {
  35. uint8_t c;
  36. if (!CBS_get_u8(cbs, &c)) {
  37. return 0;
  38. }
  39. if (c <= 0x7f) {
  40. *out = c;
  41. return 1;
  42. }
  43. uint32_t v, lower_bound;
  44. size_t len;
  45. if ((c & TOP_BITS(3)) == TOP_BITS(2)) {
  46. v = c & BOTTOM_BITS(5);
  47. len = 1;
  48. lower_bound = 0x80;
  49. } else if ((c & TOP_BITS(4)) == TOP_BITS(3)) {
  50. v = c & BOTTOM_BITS(4);
  51. len = 2;
  52. lower_bound = 0x800;
  53. } else if ((c & TOP_BITS(5)) == TOP_BITS(4)) {
  54. v = c & BOTTOM_BITS(3);
  55. len = 3;
  56. lower_bound = 0x10000;
  57. } else {
  58. return 0;
  59. }
  60. for (size_t i = 0; i < len; i++) {
  61. if (!CBS_get_u8(cbs, &c) ||
  62. (c & TOP_BITS(2)) != TOP_BITS(1)) {
  63. return 0;
  64. }
  65. v <<= 6;
  66. v |= c & BOTTOM_BITS(6);
  67. }
  68. if (!is_valid_code_point(v) ||
  69. v < lower_bound) {
  70. return 0;
  71. }
  72. *out = v;
  73. return 1;
  74. }
  75. int cbs_get_latin1(CBS *cbs, uint32_t *out) {
  76. uint8_t c;
  77. if (!CBS_get_u8(cbs, &c)) {
  78. return 0;
  79. }
  80. *out = c;
  81. return 1;
  82. }
  83. int cbs_get_ucs2_be(CBS *cbs, uint32_t *out) {
  84. // Note UCS-2 (used by BMPString) does not support surrogates.
  85. uint16_t c;
  86. if (!CBS_get_u16(cbs, &c) ||
  87. !is_valid_code_point(c)) {
  88. return 0;
  89. }
  90. *out = c;
  91. return 1;
  92. }
  93. int cbs_get_utf32_be(CBS *cbs, uint32_t *out) {
  94. return CBS_get_u32(cbs, out) && is_valid_code_point(*out);
  95. }
  // cbb_get_utf8_len returns the number of bytes in the UTF-8 encoding of |u|:
  // one for values up to 0x7f, two up to 0x7ff, three up to 0xffff, and four
  // for anything larger. |u| is not validated; every value above 0xffff
  // reports four.
  size_t cbb_get_utf8_len(uint32_t u) {
    // largest[i] is the largest value that fits in an (i + 1)-byte encoding.
    static const uint32_t largest[] = {0x7f, 0x7ff, 0xffff};
    const size_t count = sizeof(largest) / sizeof(largest[0]);

    for (size_t i = 0; i < count; i++) {
      if (u <= largest[i]) {
        return i + 1;
      }
    }
    return count + 1;
  }
  108. int cbb_add_utf8(CBB *cbb, uint32_t u) {
  109. if (!is_valid_code_point(u)) {
  110. return 0;
  111. }
  112. if (u <= 0x7f) {
  113. return CBB_add_u8(cbb, (uint8_t)u);
  114. }
  115. if (u <= 0x7ff) {
  116. return CBB_add_u8(cbb, TOP_BITS(2) | (u >> 6)) &&
  117. CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
  118. }
  119. if (u <= 0xffff) {
  120. return CBB_add_u8(cbb, TOP_BITS(3) | (u >> 12)) &&
  121. CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) &&
  122. CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
  123. }
  124. if (u <= 0x10ffff) {
  125. return CBB_add_u8(cbb, TOP_BITS(4) | (u >> 18)) &&
  126. CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 12) & BOTTOM_BITS(6))) &&
  127. CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) &&
  128. CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
  129. }
  130. return 0;
  131. }
  132. int cbb_add_latin1(CBB *cbb, uint32_t u) {
  133. return u <= 0xff && CBB_add_u8(cbb, (uint8_t)u);
  134. }
  135. int cbb_add_ucs2_be(CBB *cbb, uint32_t u) {
  136. return u <= 0xffff && is_valid_code_point(u) && CBB_add_u16(cbb, (uint16_t)u);
  137. }
  138. int cbb_add_utf32_be(CBB *cbb, uint32_t u) {
  139. return is_valid_code_point(u) && CBB_add_u32(cbb, u);
  140. }