Source: lib/util/string_utils.js

  1. /*! @license
  2. * Shaka Player
  3. * Copyright 2016 Google LLC
  4. * SPDX-License-Identifier: Apache-2.0
  5. */
  6. goog.provide('shaka.util.StringUtils');
  7. goog.require('goog.asserts');
  8. goog.require('shaka.log');
  9. goog.require('shaka.util.BufferUtils');
  10. goog.require('shaka.util.Error');
  11. goog.require('shaka.util.Lazy');
  12. /**
  13. * @namespace shaka.util.StringUtils
  14. * @summary A set of string utility functions.
  15. * @export
  16. */
  shaka.util.StringUtils = class {
    /**
     * Decodes a buffer of UTF-8 bytes into a string.
     *
     * A leading UTF-8 byte-order-mark (EF BB BF) is dropped before decoding.
     * Missing input (null/undefined) yields the empty string.
     *
     * @param {?BufferSource} data
     * @return {string}
     * @export
     */
    static fromUTF8(data) {
      if (!data) {
        return '';
      }

      const bytes = shaka.util.BufferUtils.toUint8(data);
      const startsWithBom =
          bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf;
      const payload = startsWithBom ? bytes.subarray(3) : bytes;

      // TextDecoder keeps going when it meets an invalid UTF-8 sequence,
      // unlike the older decodeUriComponent approach which would stop and
      // throw. The replacement character in the output is the only hint that
      // something was wrong, so it is reported through the log.
      const text = new TextDecoder().decode(payload);
      if (text.includes('\uFFFD')) {
        shaka.log.alwaysError('Decoded string contains an "unknown character" ' +
                              'codepoint. That probably means the UTF8 ' +
                              'encoding was incorrect!');
      }
      return text;
    }

    /**
     * Decodes a buffer of UTF-16 code units into a string.
     *
     * Missing input (null/undefined) yields the empty string. When the byte
     * length is odd and noThrow is not set, a CRITICAL / TEXT / BAD_ENCODING
     * shaka.util.Error is thrown.
     *
     * @param {?BufferSource} data
     * @param {boolean} littleEndian
     *   true to read little endian, false to read big.
     * @param {boolean=} noThrow true to avoid throwing in cases where we may
     *   expect invalid input. If noThrow is true and the data has an odd
     *   length, the trailing byte is ignored.
     * @return {string}
     * @export
     */
    static fromUTF16(data, littleEndian, noThrow) {
      if (!data) {
        return '';
      }

      const hasOddLength = data.byteLength % 2 !== 0;
      if (hasOddLength && !noThrow) {
        shaka.log.error('Data has an incorrect length, must be even.');
        throw new shaka.util.Error(
            shaka.util.Error.Severity.CRITICAL,
            shaka.util.Error.Category.TEXT,
            shaka.util.Error.Code.BAD_ENCODING);
      }

      // Whole 16-bit units only; a dangling final byte is left out.
      const unitCount = Math.floor(data.byteLength / 2);
      const codeUnits = new Uint16Array(unitCount);

      // Reading through a DataView lets the caller choose the byte order.
      const reader = shaka.util.BufferUtils.toDataView(data);
      let byteOffset = 0;
      for (let unit = 0; unit < unitCount; unit++) {
        codeUnits[unit] = reader.getUint16(byteOffset, littleEndian);
        byteOffset += 2;
      }

      return shaka.util.StringUtils.fromCharCode(codeUnits);
    }

    /**
     * Decodes a buffer whose encoding is not known in advance.
     *
     * A byte-order-mark decides the encoding when one is present. Otherwise
     * the first four bytes are inspected: zeros at even positions suggest
     * big-endian UTF-16, zeros at odd positions suggest little-endian UTF-16,
     * and printable ASCII suggests UTF-8. If none of these match, a
     * CRITICAL / TEXT / UNABLE_TO_DETECT_ENCODING shaka.util.Error is thrown.
     *
     * @param {?BufferSource} data
     * @return {string}
     * @export
     */
    static fromBytesAutoDetect(data) {
      if (!data) {
        return '';
      }

      const StringUtils = shaka.util.StringUtils;
      const bytes = shaka.util.BufferUtils.toUint8(data);

      // Positions past the end of the buffer read as undefined and therefore
      // never compare equal to any of the marker values below.
      const first = bytes[0];
      const second = bytes[1];
      const third = bytes[2];
      const fourth = bytes[3];

      // Explicit byte-order-marks.
      if (first === 0xef && second === 0xbb && third === 0xbf) {
        // fromUTF8 strips the mark itself.
        return StringUtils.fromUTF8(bytes);
      }
      if (first === 0xfe && second === 0xff) {
        return StringUtils.fromUTF16(
            bytes.subarray(2), /* littleEndian= */ false);
      }
      if (first === 0xff && second === 0xfe) {
        return StringUtils.fromUTF16(
            bytes.subarray(2), /* littleEndian= */ true);
      }

      shaka.log.debug(
          'Unable to find byte-order-mark, making an educated guess.');

      if (first === 0 && third === 0) {
        return StringUtils.fromUTF16(data, /* littleEndian= */ false);
      }
      if (second === 0 && fourth === 0) {
        return StringUtils.fromUTF16(data, /* littleEndian= */ true);
      }

      // Every byte that exists among the first four must lie in ' '..'~';
      // positions beyond the end of the buffer do not count against it.
      const probe = bytes.subarray(0, 4);
      let looksLikeAscii = true;
      for (let i = 0; i < probe.length; i++) {
        if (probe[i] < 0x20 || probe[i] > 0x7e) {
          looksLikeAscii = false;
          break;
        }
      }
      if (looksLikeAscii) {
        return StringUtils.fromUTF8(data);
      }

      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.UNABLE_TO_DETECT_ENCODING);
    }

    /**
     * Encodes a string as UTF-8 and returns the bytes as an ArrayBuffer.
     *
     * @param {string} str
     * @return {!ArrayBuffer}
     * @export
     */
    static toUTF8(str) {
      const encoded = new TextEncoder().encode(str);
      return shaka.util.BufferUtils.toArrayBuffer(encoded);
    }

    /**
     * Encodes a string as UTF-16 and returns the bytes as an ArrayBuffer.
     * Each code unit of the string occupies two bytes in the requested byte
     * order.
     *
     * @param {string} str
     * @param {boolean} littleEndian
     * @return {!ArrayBuffer}
     * @export
     */
    static toUTF16(str, littleEndian) {
      const unitCount = str.length;
      const buffer = new ArrayBuffer(unitCount * 2);
      const writer = new DataView(buffer);
      for (let unit = 0; unit < unitCount; ++unit) {
        writer.setUint16(
            /* position= */ unit * 2, str.charCodeAt(unit), littleEndian);
      }
      return buffer;
    }

    /**
     * Builds a string from an array of char codes.
     *
     * Calling String.fromCharCode.apply on a very large array can trigger
     * stack errors, so the work is delegated to a lazily-created
     * implementation that processes the array in several pieces.
     *
     * @param {!TypedArray} array
     * @return {string}
     */
    static fromCharCode(array) {
      const convert = shaka.util.StringUtils.fromCharCodeImpl_.value();
      return convert(array);
    }

    /**
     * Resets the implementation used by fromCharCode.
     * For debug use.
     * @export
     */
    static resetFromCharCode() {
      shaka.util.StringUtils.fromCharCodeImpl_.reset();
    }
  };
  168. /** @private {!shaka.util.Lazy.<function(!TypedArray):string>} */
  shaka.util.StringUtils.fromCharCodeImpl_ = new shaka.util.Lazy(() => {
    /**
     * Checks whether String.fromCharCode.apply() can be called with |size|
     * arguments on this platform without throwing.
     *
     * @param {number} size
     * @return {boolean}
     */
    const supportsChunkSize = (size) => {
      try {
        // The compiler will complain about suspicious value if this isn't
        // stored in a variable and used.
        const buffer = new Uint8Array(size);

        // This can't use the spread operator, or it blows up on Xbox One.
        // So we use apply() instead, which is normally not allowed.
        // See issue #2186 for more details.
        // eslint-disable-next-line no-restricted-syntax
        const foo = String.fromCharCode.apply(null, buffer);
        goog.asserts.assert(foo, 'Should get value');
        return foo.length > 0; // Actually use "foo", so it's not compiled out.
      } catch (error) {
        // Any failure here just means this chunk size is not usable.
        return false;
      }
    };

    // Different browsers support different chunk sizes; find out the largest
    // this browser supports so we can use larger chunks on supported browsers
    // but still support lower-end devices that require small chunks.
    // 64k is supported on all major desktop browsers.
    //
    // The size is halved with integer division so the probe visits
    // 64k, 32k, ..., 2, 1 and then stops. Plain "/= 2" would carry on through
    // fractional sizes (0.5, 0.25, ...), each of which creates an empty
    // Uint8Array and can never pass the probe.
    for (let size = 64 * 1024; size > 0; size = Math.floor(size / 2)) {
      if (supportsChunkSize(size)) {
        return (buffer) => {
          let ret = '';
          for (let i = 0; i < buffer.length; i += size) {
            const subArray = buffer.subarray(i, i + size);

            // This can't use the spread operator, or it blows up on Xbox One.
            // So we use apply() instead, which is normally not allowed.
            // See issue #2186 for more details.
            // eslint-disable-next-line no-restricted-syntax
            ret += String.fromCharCode.apply(null, subArray); // Issue #2186
          }
          return ret;
        };
      }
    }

    // Reached only if even a single-element chunk could not be converted.
    goog.asserts.assert(false, 'Unable to create a fromCharCode method');
    return null;
  });