1
// Copyright (c) 2005, Rodrigo Braz Monteiro
2
// All rights reserved.
4
// Redistribution and use in source and binary forms, with or without
5
// modification, are permitted provided that the following conditions are met:
7
// * Redistributions of source code must retain the above copyright notice,
8
// this list of conditions and the following disclaimer.
9
// * Redistributions in binary form must reproduce the above copyright notice,
10
// this list of conditions and the following disclaimer in the documentation
11
// and/or other materials provided with the distribution.
12
// * Neither the name of the Aegisub Group nor the names of its contributors
13
// may be used to endorse or promote products derived from this software
14
// without specific prior written permission.
16
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
// POSSIBILITY OF SUCH DAMAGE.
28
// -----------------------------------------------------------------------------
32
// Website: http://aegisub.cellosoft.com
33
// Contact: mailto:zeratul@cellosoft.com
44
#include "text_file_reader.h"
46
#ifdef WITH_UNIVCHARDET
47
#include "charset_detect.h"
53
// Constructs a reader for a text file and prepares its character conversion.
//
// Parameters _filename, enc and _trim are presumably stored into the
// `filename`, `encoding` and trim members by statements that are not visible
// in this view -- TODO confirm.
//
// NOTE(review): the bare numeric lines in this function are not C++ and look
// like leftover line numbers; some statements and the closing brace appear
// to be missing from this copy of the file.
TextFileReader::TextFileReader(wxString _filename,wxString enc,bool _trim) {
65
// No encoding selected yet: detect one by inspecting the file itself.
if (encoding.IsEmpty()) encoding = GetEncoding(filename);
66
// Files classified as "binary" get no converter configured.
if (encoding == _T("binary")) return;
67
// Choose the conversion object (`conv`) that matches `encoding`.
SetEncodingConfiguration();
73
// Destructor: deletes the conversion object when `customConv` is set.
TextFileReader::~TextFileReader() {
76
// Clean up conversion.
// The delete is conditional because `conv` can also be assigned
// wxConvCurrent (see SetEncodingConfiguration), which is not allocated with
// `new` by this class. Presumably `customConv` records which case applies
// -- TODO confirm where it is set.
77
if (customConv) delete conv;
81
///////////////////////////
82
// Determine file encoding
83
// Guesses the character encoding of a file from its first four bytes.
//
// @param _filename  Path of the file to inspect.
// @return An encoding name as a string. Visible results are: "unknown" when
//         the file cannot be opened or read; "UTF-8", "UTF-16LE", "UTF-16BE",
//         "UTF-32LE", "UTF-32BE" or "UTF-7" from a byte order mark or the
//         UTF-16 heuristic; "binary" when a control byte is found; and, when
//         WITH_UNIVCHARDET is defined, whatever det.GetEncoding() reports.
//         The final fallback return is not visible in this view; the trailing
//         comment suggests a local encoding -- TODO confirm.
//
// NOTE(review): the bare numeric lines are not C++ and look like leftover
// line numbers. Declarations (b, numread, ifile, det), the #else/#endif
// lines and the closing braces are not visible here.
wxString TextFileReader::GetEncoding(const wxString _filename) {
87
// Zero the four-byte probe buffer first, so bytes that are never read
// compare as 0 in the checks below.
for (int i=0;i<4;i++) b[i] = 0;
89
// Read four bytes from file
90
#ifdef TEXT_READER_USE_STDIO
91
// The visible code in this branch uses the Win32 CreateFile/ReadFile API.
// TODO: maybe make this use posix-style fopen() api's instead as well?
92
HANDLE ifile = CreateFile(
93
_filename.c_str(), // filename
94
FILE_READ_DATA, // access mode
95
FILE_SHARE_READ, // share mode
96
0, // security descriptor
97
OPEN_EXISTING, // creation disposition
98
FILE_FLAG_SEQUENTIAL_SCAN, // flags
100
// Could not open the file: report the encoding as "unknown".
if (ifile == INVALID_HANDLE_VALUE) {
101
return _T("unknown");
104
// Read failure is likewise reported as "unknown".
if (!ReadFile(ifile, (char*)b, 4, &numread, 0)) {
106
return _T("unknown");
109
// File too short to decide, assume local
// (the code implementing this is not visible in this view)
116
// The two open() calls below are presumably alternatives chosen by a
// preprocessor condition that is not visible here -- TODO confirm.
ifile.open(_filename.wc_str());
118
ifile.open(wxFNCONV(_filename));
120
if (!ifile.is_open()) {
121
return _T("unknown");
123
ifile.read((char*)b,4);
127
// Try to get the byte order mark from them.
// Order matters: the UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE
// mark (FF FE), so the four-byte UTF-32 patterns are tested before the
// two-byte UTF-16 ones.
128
if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return _T("UTF-8");
129
else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return _T("UTF-32LE");
130
else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return _T("UTF-32BE");
131
else if (b[0] == 0xFF && b[1] == 0xFE) return _T("UTF-16LE");
132
else if (b[0] == 0xFE && b[1] == 0xFF) return _T("UTF-16BE");
133
// 2B 2F 76 is the ASCII sequence "+/v".
else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return _T("UTF-7");
135
// Try to guess UTF-16 without a byte order mark: two consecutive 16-bit
// units that each pair a zero byte with a byte >= 32. The position of the
// zero byte decides between big and little endian.
136
else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return _T("UTF-16BE");
137
else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return _T("UTF-16LE");
139
// If any of the first four bytes are under 0x20 (the first printable character),
140
// except for 9-13 range, assume binary
141
for (int i=0;i<4;i++) {
142
if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return _T("binary");
145
#ifdef WITH_UNIVCHARDET
146
// Use universalchardet library to detect charset
148
return det.GetEncoding(_filename);
150
// Fall back to local
156
//////////////////////////////
157
// Set encoding configuration
158
// Selects the conversion object `conv` according to the `encoding` member.
//
// NOTE(review): the bare numeric lines are not C++ and look like leftover
// line numbers. The bodies of the UTF-16 branches and the closing braces are
// not visible in this view, so how those branches configure the reader
// cannot be stated here.
void TextFileReader::SetEncodingConfiguration() {
159
// Set encoding configuration
164
// UTF-8 gets the dedicated wxMBConvUTF8 converter.
if (encoding == _T("UTF-8")) {
165
conv = new wxMBConvUTF8;
168
else if (encoding == _T("UTF-16LE")) {
171
else if (encoding == _T("UTF-16BE")) {
175
// UTF-7 is converted through a wxCSConv built from the encoding name.
else if (encoding == _T("UTF-7")) {
176
conv = new wxCSConv(encoding);
179
// "Local" uses the global wxConvCurrent converter; unlike the other visible
// branches, nothing is allocated with `new` here.
else if (encoding == _T("Local")) {
180
conv = wxConvCurrent;
183
// Presumably the final else branch: any other encoding name is handed to
// wxCSConv -- TODO confirm against the full file.
conv = new wxCSConv(encoding);
189
//////////////////////////
190
// Reads a line from file
191
// Reads one line from the open file into `wxbuffer`, converting it to a
// wxString, then normalises it: line-break characters become spaces, a
// leading U+FEFF is dropped, and leading whitespace is trimmed.
//
// The visible code has two reading strategies, a two-bytes-at-a-time loop for
// UTF-16 and a byte-oriented read for ASCII/UTF-8, each with a
// TEXT_READER_USE_STDIO variant. The condition choosing between them is not
// visible here.
//
// @return Presumably `wxbuffer`; the return statement is not visible in this
//         view -- TODO confirm.
//
// NOTE(review): the bare numeric lines are not C++ and look like leftover
// line numbers. Several declarations (wxbuffer, ch, charbuffer, len), the
// #else/#endif lines and the closing braces are missing from this copy.
wxString TextFileReader::ReadLineFromFile() {
194
// Initial capacity reserved for the output string.
size_t bufAlloc = 1024;
195
wxbuffer.Alloc(bufAlloc);
196
#ifdef TEXT_READER_USE_STDIO
200
std::string buffer = "";
203
// Read UTF-16 line from file
209
#ifdef TEXT_READER_USE_STDIO
210
while (ch != L'\n' && !feof(file)) {
211
// Read two chars from file
// NOTE(review): the result of fread() is not checked on this line.
212
fread(charbuffer, 2, 1, file);
214
while (ch != L'\n' && !file.eof()) {
215
// Read two chars from file
218
file.read(charbuffer,2);
221
// Swap bytes for big endian
// NOTE(review): the `register` storage class was removed in C++17.
223
register char aux = charbuffer[0];
224
charbuffer[0] = charbuffer[1];
228
// Convert two chars into a widechar and append to string
// NOTE(review): this reads sizeof(wchar_t) bytes from charbuffer although
// only two were just read; where wchar_t is wider than two bytes the result
// depends on how charbuffer is declared and initialised (not visible here).
229
ch = *((wchar_t*)charbuffer);
230
// Nearly out of reserved space: the buffer is re-allocated. The statement
// that grows bufAlloc is presumably in the line missing here.
if (len >= bufAlloc - 1) {
232
wxbuffer.Alloc(bufAlloc);
239
// Read ASCII/UTF-8 line from file
241
#ifdef TEXT_READER_USE_STDIO
244
// Lines are read in pieces of at most 511 characters plus the terminator.
if (fgets(buffer, 512, file)) {
246
// FIXME, this might break on incomplete multibyte characters
247
wxString linepart(buffer, *conv);
248
wxbuffer += linepart;
249
// buffer[511] is presumably pre-set to the sentinel \1 before fgets() in a
// line not visible here. The sentinel surviving, or a '\n' in the last data
// slot, means the whole line has been read.
if (buffer[511] == '\1' || buffer[510] == '\n') {
250
// our sentinel \1 wasn't overwritten, meaning an EOL was found
253
// otherwise the sentinel \1 was overwritten (presumably with \0), so just loop on
261
// Stream variant: read a whole line, then convert it in one step.
getline(file,buffer);
263
if (buffer.length()) wxbuffer = wxString(buffer.c_str(),*conv);
267
// Remove line breaks
268
//wxbuffer.Replace(_T("\r"),_T("\0"));
269
//wxbuffer.Replace(_T("\n"),_T("\0"));
270
size_t len=wxbuffer.Length();
271
// Each CR or LF is overwritten with a space, so the string keeps its length.
for (size_t i=0;i<len;i++) {
272
if (wxbuffer[i] == _T('\r') || wxbuffer[i] == _T('\n')) wxbuffer[i] = _T(' ');
276
// Drop a leading U+FEFF (byte order mark) so it does not end up in the text.
if (wxbuffer.Length() > 0 && wxbuffer[0] == 0xFEFF) {
277
wxbuffer = wxbuffer.Mid(1);
283
// Trim(false) strips whitespace from the left side of the string.
// Presumably guarded by the reader's trim setting -- TODO confirm.
wxbuffer.Trim(false);
291
// Opens `filename` for reading in binary mode and, in the stream variant,
// samples the start of the file to decide whether it looks binary (result
// stored in `isBinary`).
//
// Throws the string literal "Failed opening file for reading." wrapped in
// _T(), i.e. a character pointer rather than an exception object, when the
// file cannot be opened.
//
// NOTE(review): the bare numeric lines are not C++ and look like leftover
// line numbers. The #else/#endif lines, the declaration of `buf`, the
// statement that increments binaryFactor and the closing braces are not
// visible in this view.
void TextFileReader::Open() {
293
#ifdef TEXT_READER_USE_STDIO
294
// binary mode, because ascii mode is never to be trusted
295
file = _tfopen(filename.c_str(), _T("rb"));
297
// Presumably guarded by a null check on `file` in the line missing above.
throw _T("Failed opening file for reading.");
301
// The two open() calls below are presumably alternatives chosen by a
// preprocessor condition that is not visible here -- TODO confirm.
file.open(filename.wc_str(),std::ios::in | std::ios::binary);
303
file.open(wxFNCONV(filename),std::ios::in | std::ios::binary);
305
if (!file.is_open()) {
306
throw _T("Failed opening file for reading.");
311
// Check if file seems binary
312
// Counter used for the binary decision below (its increment is not visible).
size_t binaryFactor = 0;
313
const size_t bufsize = 512;
315
// NOTE(review): istream::get(char*, n) stops at the first '\n' and sets
// failbit when it extracts nothing, so this may sample less than 512 bytes
// -- confirm that is intended.
file.get(buf, bufsize);
316
size_t bytes_read = file.gcount();
317
// Rewind so that later reads start from the beginning of the file.
file.seekg(0, std::ios_base::beg);
318
for (size_t i = 0; i < bytes_read; ++i) {
319
// Control characters (below 0x20) other than CR, LF and TAB are the ones
// singled out here.
if (((unsigned char)buf[i]) < 32) {
320
if (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t')
324
// Binary when the count exceeds 8 outright, or exceeds one eighth of the
// bytes sampled.
isBinary = (binaryFactor > 8) || (binaryFactor > bytes_read/8);
330
// Closes the reader's file.
//
// NOTE(review): only the signature and the start of a TEXT_READER_USE_STDIO
// branch are visible in this view; the statements that release the file are
// missing from this copy, so their behaviour is not described here.
void TextFileReader::Close() {
332
#ifdef TEXT_READER_USE_STDIO
341
//////////////////////////////////
342
// Checks if there's more to read
343
// Reports whether another line can be read from the file.
//
// @return false when the encoding is "binary" (visible in the
//         TEXT_READER_USE_STDIO branch); in the stream-based code, true while
//         the stream has not reached end of file.
//
// NOTE(review): the bare numeric lines are not C++ and look like leftover
// line numbers. The remainder of the stdio branch and the #else/#endif lines
// are not visible in this view.
bool TextFileReader::HasMoreLines() {
344
#ifdef TEXT_READER_USE_STDIO
345
// A file classified as binary is never read line by line.
if (encoding == _T("binary")) return false;
348
// Presumably the non-stdio branch: keep going until the stream hits EOF.
return (!file.eof());
353
////////////////////////////////
354
// Ensure that charset is valid
355
// Checks that a charset name is one this reader can handle.
//
// @param enc  Encoding name to validate.
//
// The names "unknown", "UTF-32BE" and "UTF-32LE" are treated as unsupported,
// and an error message is built for them. What is then done with the message
// (presumably it is thrown) is not visible in this view -- TODO confirm.
//
// NOTE(review): the bare numeric lines are not C++ and look like leftover
// line numbers; the line that inserts the charset name into the message and
// the closing braces are missing from this copy.
void TextFileReader::EnsureValid(wxString enc) {
356
if (enc == _T("unknown") || enc == _T("UTF-32BE") || enc == _T("UTF-32LE")) {
357
// Message has the form: "Character set " ... " is not supported."
wxString error = _T("Character set ");
359
error += _T(" is not supported.");
365
///////////////////////////
366
// Get encoding being used
367
wxString TextFileReader::GetCurrentEncoding() {