Code coverage report for lib/string


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225  
 
886
 
 
1914
1
 
 
 
 
 
 
 
 
 
 
 
886
1914
1914
1913
 
 
1161
1161
 
 
 
592
592
592
 
 
45
45
45
 
115
115
 
 
 
 
1798
 
1798
 
1798
 
 
 
 
 
 
 
 
 
 
 
 
886
12056
12056
12056
12056
12056
12056
12056
 
12056
 
3054
3054
 
 
3054
3054
 
3054
 
 
373
373
 
373
 
 
 
2681
2681
 
 
2681
 
 
2681
2681
34
34
34
 
2647
 
 
2647
1713
1713
 
1713
 
 
 
 
9970
3508
9970
 
9970
9970
 
2707
2707
 
 
9970
9970
 
9970
9970
 
9970
10
10
10
10
 
10
10
 
10
 
 
 
9960
 
 
 
 
 
 
886
7630
 
7630
7630
 
 
 
7630
19832
 
 
 
 
19832
357
357
357
 
 
 
19475
803
803
803
 
 
 
18672
8
8
8
 
 
 
7630
 
7630
 
 
886
639
639
37
 
639
639
66
66
66
66
 
 
639
 
 
886
4274
 
 
886
2250
2250
2250
 
 
886
90
90
90
 
 
 
  (function () { 'use strict';
 
function assertEncoding(encoding) {
  // Do not cache `Buffer.isEncoding`, some modules monkey-patch it to support
  // additional encodings
  if (encoding && !Buffer.isEncoding(encoding)) {
    throw new Error('Unknown encoding: ' + encoding);
  }
}
 
// StringDecoder provides an interface for efficiently splitting a series of
// buffers into a series of JS strings without breaking apart multi-byte
// characters. CESU-8 is handled as part of the UTF-8 encoding.
//
// @TODO Handling all encodings inside a single object makes it very difficult
// to reason about this code, so it should be split up in the future.
// @TODO There should be a utf8-strict encoding that rejects invalid UTF-8 code
// points as used by CESU-8.
const StringDecoder = exports.StringDecoder = function(encoding) {
  this.encoding = (encoding || 'utf8').toLowerCase().replace(/[-_]/, '');
  assertEncoding(encoding);
  switch (this.encoding) {
    case 'utf8':
      // CESU-8 represents each of Surrogate Pair by 3-bytes
      this.surrogateSize = 3;
      break;
    case 'ucs2':
    case 'utf16le':
      // UTF-16 represents each of Surrogate Pair by 2-bytes
      this.surrogateSize = 2;
      this.detectIncompleteChar = utf16DetectIncompleteChar;
      break;
    case 'base64':
      // Base-64 stores 3 bytes in 4 chars, and pads the remainder.
      this.surrogateSize = 3;
      this.detectIncompleteChar = base64DetectIncompleteChar;
      break;
    default:
      this.write = passThroughWrite;
      return;
  }
 
  // Enough space to store all bytes of a single character. UTF-8 needs 4
  // bytes, but CESU-8 may require up to 6 (3 bytes per surrogate).
  this.charBuffer = new Buffer(6);
  // Number of bytes received for the current incomplete multi-byte character.
  this.charReceived = 0;
  // Number of bytes expected for the current incomplete multi-byte character.
  this.charLength = 0;
};
 
 
// write decodes the given buffer and returns it as JS string that is
// guaranteed to not contain any partial multi-byte characters. Any partial
// character found at the end of the buffer is buffered up, and will be
// returned when calling write again with the remaining bytes.
//
// Note: Converting a Buffer containing an orphan surrogate to a String
// currently works, but converting a String to a Buffer (via `new Buffer`, or
// Buffer#write) will replace incomplete surrogates with the unicode
// replacement character. See https://codereview.chromium.org/121173009/ .
StringDecoder.prototype.write = function(buffer) {
  var charStr = '';
  var buflen = buffer.length;
  var charBuffer = this.charBuffer;
  var charLength = this.charLength;
  var charReceived = this.charReceived;
  var surrogateSize = this.surrogateSize;
  var encoding = this.encoding;
  // if our last write ended with an incomplete multibyte character
  while (charLength) {
    // determine how many remaining bytes this buffer has to offer for this char
    var diff = charLength - charReceived;
    var available = (buflen >= diff) ? diff : buflen;
 
    // add the new bytes to the char buffer
    buffer.copy(charBuffer, charReceived, 0, available);
    charReceived += available;
 
    if (charReceived < charLength) {
      // still not enough chars in this buffer? wait for more ...
 
      this.charLength = charLength;
      this.charReceived = charReceived;
 
      return '';
    }
 
    // remove bytes belonging to the current character from the buffer
    buffer = buffer.slice(available, buflen);
    buflen = buffer.length;
 
    // get the character that was split
    charStr = charBuffer.toString(encoding, 0, charLength);
 
    // CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
    var charCode = charStr.charCodeAt(charStr.length - 1);
    if (charCode >= 0xD800 && charCode <= 0xDBFF) {
      charLength += surrogateSize;
      charStr = '';
      continue;
    }
    charReceived = charLength = 0;
 
    // if there are no more bytes in this buffer, just emit our char
    if (buflen === 0) {
      this.charLength = charLength;
      this.charReceived = charReceived;
 
      return charStr;
    }
  }
 
  // determine and set charLength / charReceived
  if (this.detectIncompleteChar(buffer))
    charLength = this.charLength;
  charReceived = this.charReceived;
 
  var end = buflen;
  if (charLength) {
    // buffer the incomplete character bytes we got
    buffer.copy(charBuffer, 0, buflen - charReceived, end);
    end -= charReceived;
  }
 
  this.charLength = charLength;
  charStr += buffer.toString(encoding, 0, end);
 
  var end = charStr.length - 1;
  var charCode = charStr.charCodeAt(end);
  // CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
  if (charCode >= 0xD800 && charCode <= 0xDBFF) {
    charLength += surrogateSize;
    charReceived += surrogateSize;
    charBuffer.copy(charBuffer, surrogateSize, 0, surrogateSize);
    buffer.copy(charBuffer, 0, 0, surrogateSize);
 
    this.charLength = charLength;
    this.charReceived = charReceived;
 
    return charStr.substring(0, end);
  }
 
  // or just emit the charStr
  return charStr;
};
 
// detectIncompleteChar determines if there is an incomplete UTF-8 character at
// the end of the given buffer. If so, it sets this.charLength to the byte
// length that character, and sets this.charReceived to the number of bytes
// that are available for this character.
StringDecoder.prototype.detectIncompleteChar = function(buffer) {
  var buflen = buffer.length;
  // determine how many bytes we have to check at the end of this buffer
  var i = (buflen >= 3) ? 3 : buflen;
  var newlen = false;
 
  // Figure out if one of the last i bytes of our buffer announces an
  // incomplete char.
  for (; i > 0; i--) {
    var c = buffer[buflen - i];
 
    // See http://en.wikipedia.org/wiki/UTF-8#Description
 
    // 110XXXXX
    if (i === 1 && c >> 5 === 0x06) {
      this.charLength = 2;
      newlen = true;
      break;
    }
 
    // 1110XXXX
    if (i <= 2 && c >> 4 === 0x0E) {
      this.charLength = 3;
      newlen = true;
      break;
    }
 
    // 11110XXX
    if (i <= 3 && c >> 3 === 0x1E) {
      this.charLength = 4;
      newlen = true;
      break;
    }
  }
 
  this.charReceived = i;
 
  return newlen;
};
 
StringDecoder.prototype.end = function(buffer) {
  var res = '';
  if (buffer && buffer.length)
    res = this.write(buffer);
 
  var charReceived = this.charReceived;
  if (charReceived) {
    var cr = charReceived;
    var buf = this.charBuffer;
    var enc = this.encoding;
    res += buf.toString(enc, 0, cr);
  }
 
  return res;
};
 
function passThroughWrite(buffer) {
  return buffer.toString(this.encoding);
}
 
function utf16DetectIncompleteChar(buffer) {
  var charReceived = this.charReceived = buffer.length % 2;
  this.charLength = charReceived ? 2 : 0;
  return true;
}
 
function base64DetectIncompleteChar(buffer) {
  var charReceived = this.charReceived = buffer.length % 3;
  this.charLength = charReceived ? 3 : 0;
  return true;
}
 
}());
Code coverage report for lib/string_decoder.js

Statements: 100% (111 / 111) Branches: 98.11% (52 / 53) Functions: 100% (8 / 8) Lines: 100% (111 / 111) Ignored: none