// encoding.js (forked from marktext/marktext)
import ced from 'ced'
const CED_ICONV_ENCODINGS = {
'BIG5-CP950': 'big5',
KSC: 'euckr',
'ISO-2022-KR': 'euckr',
GB: 'gb2312',
ISO_2022_CN: 'gb2312',
JIS: 'shiftjis',
SJS: 'shiftjis',
Unicode: 'utf8',
// Map ASCII to UTF-8
'ASCII-7-bit': 'utf8',
ASCII: 'utf8',
MACINTOSH: 'utf8'
}
// Byte Order Mark's to detect endianness and encoding.
const BOM_ENCODINGS = {
utf8: [0xEF, 0xBB, 0xBF],
utf16be: [0xFE, 0xFF],
utf16le: [0xFF, 0xFE]
}
const checkSequence = (buffer, sequence) => {
if (buffer.length < sequence.length) {
return false
}
return sequence.every((v, i) => v === buffer[i])
}
/**
* Guess the encoding from the buffer.
*
* @param {Buffer} buffer
* @param {boolean} autoGuessEncoding
* @returns {Encoding}
*/
export const guessEncoding = (buffer, autoGuessEncoding) => {
let isBom = false
let encoding = 'utf8'
// Detect UTF8- and UTF16-BOM encodings.
for (const [key, value] of Object.entries(BOM_ENCODINGS)) {
if (checkSequence(buffer, value)) {
return { encoding: key, isBom: true }
}
}
// // Try to detect binary files. Text files should not containt four 0x00 characters.
// let zeroSeenCounter = 0
// for (let i = 0; i < Math.min(buffer.byteLength, 256); ++i) {
// if (buffer[i] === 0x00) {
// if (zeroSeenCounter >= 3) {
// return { encoding: 'binary', isBom: false }
// }
// zeroSeenCounter++
// } else {
// zeroSeenCounter = 0
// }
// }
// Auto guess encoding, otherwise use UTF8.
if (autoGuessEncoding) {
encoding = ced(buffer)
if (CED_ICONV_ENCODINGS[encoding]) {
encoding = CED_ICONV_ENCODINGS[encoding]
} else {
encoding = encoding.toLowerCase().replace(/-_/g, '')
}
}
return { encoding, isBom }
}