Commit 0a3fed68 authored by Alexander A. Maly

Refactoring I/O

parent 8e2181df
Pipeline #2833 failed with stages
in 1 minute and 42 seconds
...@@ -136,13 +136,13 @@ MultiByte UTF8CodingTable::enc(uint32_t k, EncodingError &error) ...@@ -136,13 +136,13 @@ MultiByte UTF8CodingTable::enc(uint32_t k, EncodingError &error)
// for implementation details see: // for implementation details see:
// man utf-8 // man utf-8
error = NoEncodingError; error = NoEncodingError;
uint32_t v = static_cast<uint32_t>(k); uint32_t v = k;
MultiByte result; MultiByte result;
if (v <= 0x7F) { if (v <= 0x7F) {
// 0xxxxxxx // 0xxxxxxx
result.size = 1; result.size = 1;
result.data[0] = v & 0x7F; // 0x7F = 01111111 result.data[0] = v & 0x7F; // 0x7F = 01111111
} else if (v >= 0x80 && v <= 0x7FF) { } else if (v <= 0x7FF) {
// 110xxxxx,10xxxxxx // 110xxxxx,10xxxxxx
result.size = 2; result.size = 2;
result.data[1] = v & 0x3F; // 0x3F = 00111111 result.data[1] = v & 0x3F; // 0x3F = 00111111
...@@ -150,7 +150,7 @@ MultiByte UTF8CodingTable::enc(uint32_t k, EncodingError &error) ...@@ -150,7 +150,7 @@ MultiByte UTF8CodingTable::enc(uint32_t k, EncodingError &error)
v = v >> 6; v = v >> 6;
result.data[0] = v & 0x1F; // 0x1F = 00011111 result.data[0] = v & 0x1F; // 0x1F = 00011111
result.data[0] |= 0xC0; // 0xC0 = 11000000 result.data[0] |= 0xC0; // 0xC0 = 11000000
} else if (v >= 0x800 && v <= 0xFFFF) { } else if (v <= 0xFFFF) {
// 1110xxxx, 10xxxxxx, 10xxxxxx // 1110xxxx, 10xxxxxx, 10xxxxxx
result.size = 3; result.size = 3;
result.data[2] = v & 0x3F; // 0x3F = 00111111 result.data[2] = v & 0x3F; // 0x3F = 00111111
...@@ -189,27 +189,27 @@ uint32_t UTF8CodingTable::dec(charptr &from, EncodingError &error) ...@@ -189,27 +189,27 @@ uint32_t UTF8CodingTable::dec(charptr &from, EncodingError &error)
// first byte mask: 110xxxxx // first byte mask: 110xxxxx
// -- use two bytes // -- use two bytes
v = byte & 0x1F; // 0x1F = 000xxxxx v = byte & 0x1F; // 0x1F = 000xxxxx
if (from == 0 || (*from) == '\0') { byte = (*from++);
if (byte == '\0') {
error = StreamEnded; error = StreamEnded;
return L'?'; return L'?';
} }
byte = (*from++);
v = (v << 6) | (byte & 0x3F); // 0x3F = 00111111 v = (v << 6) | (byte & 0x3F); // 0x3F = 00111111
} else if (byte_first_4_bits == 0x0E) { } else if (byte_first_4_bits == 0x0E) {
// first byte mask: 1110xxxx // first byte mask: 1110xxxx
// -- use three bytes // -- use three bytes
v = byte & 0x0F; // 0x0F = 00001111 v = byte & 0x0F; // 0x0F = 00001111
if (from == 0 || (*from) == '\0') { byte = (*from++);
if (byte == '\0') {
error = StreamEnded; error = StreamEnded;
return L'?'; return L'?';
} }
byte = (*from++);
v = (v << 6) | (byte & 0x3F); // 0x3F = 00111111 v = (v << 6) | (byte & 0x3F); // 0x3F = 00111111
if (from == 0 || (*from) == '\0') { byte = (*from++);
if (byte == '\0') {
error = StreamEnded; error = StreamEnded;
return L'?'; return L'?';
} }
byte = (*from++);
v = (v << 6) | (byte & 0x3F); // 0x3F = 00111111 v = (v << 6) | (byte & 0x3F); // 0x3F = 00111111
} else { } else {
// Something going wrong: // Something going wrong:
......
...@@ -86,7 +86,7 @@ void Files::init() ...@@ -86,7 +86,7 @@ void Files::init()
assignedIN = stdin; assignedIN = stdin;
assignedOUT = stdout; assignedOUT = stdout;
inputDelimiters = Kumir::Core::fromAscii(" \n\t"); inputDelimiters = Kumir::Core::fromAscii(" \n\r\t");
} }
void Files::finalize() void Files::finalize()
...@@ -574,8 +574,7 @@ void Files::reset(FileType &key) ...@@ -574,8 +574,7 @@ void Files::reset(FileType &key)
Core::abort(Core::fromUtf8("Неверный ключ")); Core::abort(Core::fromUtf8("Неверный ключ"));
return; return;
} }
const FileType &f = (*it); FILE *fh = it->handle;
FILE *fh = f.handle;
fseek(fh, 0, 0); fseek(fh, 0, 0);
} }
...@@ -592,21 +591,19 @@ bool Files::eof(const FileType &key) ...@@ -592,21 +591,19 @@ bool Files::eof(const FileType &key)
Core::abort(Core::fromUtf8("Неверный ключ")); Core::abort(Core::fromUtf8("Неверный ключ"));
return false; return false;
} }
const FileType &f = (*it); FILE *fh = it->handle;
FILE *fh = f.handle;
if (feof(fh)) { if (feof(fh)) {
return true; return true;
} }
unsigned char ch = 0x00;
if (fh != stdin) { int ch = fgetc(fh);
ch = fgetc(fh); if (ch < 0) {
ungetc(ch, fh); return true;
} else {
long pos = ftell(fh);
ch = fgetc(fh);
fseek(fh, pos, SEEK_SET);
} }
return ch == 0xFF;
ungetc(ch, fh);
return false;
} }
bool Files::hasData(const FileType &key) bool Files::hasData(const FileType &key)
...@@ -618,11 +615,26 @@ bool Files::hasData(const FileType &key) ...@@ -618,11 +615,26 @@ bool Files::hasData(const FileType &key)
break; break;
} }
} }
if (it == openedFiles.end()) { if (it == openedFiles.end()) {
Core::abort(Core::fromUtf8("Неверный ключ")); Core::abort(Core::fromUtf8("Неверный ключ"));
return false; return false;
} }
FILE *fh = (*it).handle; FILE *fh = it->handle;
#if 1
for (;;) {
int c = fgetc(fh);
if (c < 0) {
return false;
}
if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
continue;
}
ungetc(c, fh);
return true;
}
#else
long backPos = -1; long backPos = -1;
if (fh != stdin) { if (fh != stdin) {
backPos = ftell(fh); backPos = ftell(fh);
...@@ -660,6 +672,7 @@ bool Files::hasData(const FileType &key) ...@@ -660,6 +672,7 @@ bool Files::hasData(const FileType &key)
fseek(fh, backPos, SEEK_SET); fseek(fh, backPos, SEEK_SET);
} }
return result; return result;
#endif
} }
bool Files::overloadedStdIn() bool Files::overloadedStdIn()
...@@ -708,16 +721,27 @@ void Files::assignOutStream(String fileName) ...@@ -708,16 +721,27 @@ void Files::assignOutStream(String fileName)
} }
} }
/**
 * Builds a file-backed output stream around an already opened stdio handle.
 *
 * @param f    stdio handle that receives the output; stored as-is.
 * @param enc  requested encoding; DefaultEncoding is replaced by UTF8.
 *
 * When the resulting encoding is UTF8 and ftell() reports offset 0 for the
 * handle, the three-byte UTF-8 byte order mark is written right away.
 */
IO::OutputStream::OutputStream(FILE *f, Encoding enc)
{
    streamType_ = File;
    file = f;
    externalBuffer_ = 0;

    // Fall back to UTF-8 when the caller did not pick an encoding.
    if (enc == DefaultEncoding) {
        encoding = UTF8;
    } else {
        encoding = enc;
    }

    // Only UTF-8 streams get a byte order mark.
    if (encoding != UTF8) {
        return;
    }
    // The mark is emitted only while the handle still reports offset 0.
    if (ftell(file) != 0) {
        return;
    }
    static const char utf8Mark[] = "\xEF\xBB\xBF";
    fwrite(utf8Mark, sizeof(char), 3, file);
}
void IO::OutputStream::writeRawString(const String &s) void IO::OutputStream::writeRawString(const String &s)
{ {
if (type() == File) { if (type() == File) {
if (encoding == UTF8 && ftell(file) == 0) {
static const char *BOM = "\xEF\xBB\xBF";
fwrite(BOM, sizeof(char), 3, file);
}
std::string bytes;
EncodingError encodingError; EncodingError encodingError;
bytes = Coder::encode(encoding, s, encodingError); std::string bytes = Coder::encode(encoding, s, encodingError);
if (encodingError) { if (encodingError) {
Core::abort(Core::fromUtf8("Ошибка кодирования строки вывода: недопустимый символ")); Core::abort(Core::fromUtf8("Ошибка кодирования строки вывода: недопустимый символ"));
} }
...@@ -739,15 +763,18 @@ IO::InputStream::InputStream(FILE *f, Encoding enc) ...@@ -739,15 +763,18 @@ IO::InputStream::InputStream(FILE *f, Encoding enc)
file_ = f; file_ = f;
externalBuffer_ = 0; externalBuffer_ = 0;
encoding_ = enc; encoding_ = enc;
lastChar_ = 0;
lastCharLength_ = 0;
lastCharHere_ = false;
if (encoding_ == DefaultEncoding) { if (encoding_ == DefaultEncoding) {
bool forceUtf8 = false; bool forceUtf8 = false;
if (f != stdin) { if (f != stdin && UTF8 != Core::getSystemEncoding()) {
long curpos = ftell(f); long curpos = ftell(f);
fseek(f, 0, SEEK_SET); fseek(f, 0, SEEK_SET);
unsigned char B[3]; unsigned char B[3];
if (fread(B, 1, 3, f) == 3) { if (fread(B, 1, 3, f) == 3) {
forceUtf8 = B[0] == 0xEF && B[1] == 0xBB && B[2] == 0xBF; forceUtf8 = (B[0] == 0xEF && B[1] == 0xBB && B[2] == 0xBF);
} }
fseek(f, curpos, SEEK_SET); fseek(f, curpos, SEEK_SET);
} }
...@@ -761,19 +788,13 @@ IO::InputStream::InputStream(FILE *f, Encoding enc) ...@@ -761,19 +788,13 @@ IO::InputStream::InputStream(FILE *f, Encoding enc)
errLength_ = 0; errLength_ = 0;
currentPosition_ = 0; currentPosition_ = 0;
if (f == stdin) { if (f != stdin) {
fileSize_ = -1; currentPosition_ = ftell(f);
} else {
long curpos = ftell(f);
fseek(f, 0L, SEEK_END);
fileSize_ = ftell(f);
fseek(f, curpos, SEEK_SET);
} }
} }
bool IO::InputStream::readRawChar(Char &x) bool IO::InputStream::readRawChar(Char &x)
{ {
lastCharBuffer_[0] = lastCharBuffer_[1] = lastCharBuffer_[2] = '\0';
if (type() == InternalBuffer) { if (type() == InternalBuffer) {
if ((size_t) currentPosition_ == buffer_.length()) { if ((size_t) currentPosition_ == buffer_.length()) {
return false; return false;
...@@ -785,24 +806,27 @@ bool IO::InputStream::readRawChar(Char &x) ...@@ -785,24 +806,27 @@ bool IO::InputStream::readRawChar(Char &x)
} else if (type() == ExternalBuffer) { } else if (type() == ExternalBuffer) {
return externalBuffer_->readRawChar(x); return externalBuffer_->readRawChar(x);
} else { } else {
if (feof(file_)) { if (lastCharHere_) {
return false; lastCharHere_ = false;
currentPosition_ += lastCharLength_;
x = lastChar_;
return true;
} }
long pos = ftell(file_); lastCharLength_ = 0;
if (fileSize_ != -1 && pos >= fileSize_) { char buf[4] = {0, 0, 0, 0};
int ch = fgetc(file_);
if (ch < 0) {
return false; return false;
} }
charptr buffer = reinterpret_cast<charptr>(&lastCharBuffer_); buf[0] = ch;
if (encoding_ != UTF8) { if (encoding_ != UTF8) {
// Read only one byte lastCharLength_ = 1;
lastCharBuffer_[0] = fgetc(file_); currentPosition_ += lastCharLength_;
uint8_t firstByte = lastCharBuffer_[0];
if (firstByte == 255 && fileSize_ == -1) {
return false;
}
} else { } else {
#if 0
// More complex... // More complex...
long cpos = ftell(file_); long cpos = 1; //ftell(file_);
fprintf(stderr, "cpos=%ld\n", cpos);
if (cpos == 0) { if (cpos == 0) {
// Try to read BOM // Try to read BOM
static const char *BOM = "\xEF\xBB\xBF"; static const char *BOM = "\xEF\xBB\xBF";
...@@ -816,45 +840,53 @@ bool IO::InputStream::readRawChar(Char &x) ...@@ -816,45 +840,53 @@ bool IO::InputStream::readRawChar(Char &x)
fseek(file_, 0, SEEK_SET); fseek(file_, 0, SEEK_SET);
} }
} }
lastCharBuffer_[0] = fgetc(file_); int ch = fgetc(file_);
uint8_t firstByte = lastCharBuffer_[0]; fprintf(stderr, "%s:%d ch=%d \n", __FILE__, __LINE__, ch);
uint8_t oneSymbMark = firstByte >> 5; if (ch < 0) {
uint8_t twoSymbMark = firstByte >> 4; return - 1;
if (firstByte == 255 && file_ == Files::getAssignedIn()) {
// Core::abort(Core::fromUtf8("Ошибка чтения данных: входной поток закончился"));
return false;
} else if (firstByte == 255) {
return false;
} }
lastCharBuffer_[0] = ch;
#endif
uint8_t firstByte = buf[0];
int extraBytes = 0; int extraBytes = 0;
if (firstByte > 127) { if (firstByte & 0x80) {
if (oneSymbMark & 0x06) { if ((firstByte >> 5) == 0x06) {
extraBytes = 1; extraBytes = 1;
} else if (twoSymbMark & 0x0E) { } else if ((firstByte >> 4) == 0x0E) {
extraBytes = 2; extraBytes = 2;
} else if ((firstByte >> 3) == 0x1E) {
extraBytes = 3;
} }
} }
lastCharLength_ = 1 + extraBytes;
currentPosition_ += lastCharLength_;
for (int i = 0; i < extraBytes; i++) { for (int i = 0; i < extraBytes; i++) {
if (feof(file_)) { ch = fgetc(file_);
if ((ch >> 6) != 2) {
Core::abort(Core::fromUtf8("Ошибка чтения данных из файла: UTF-8 файл поврежден")); Core::abort(Core::fromUtf8("Ошибка чтения данных из файла: UTF-8 файл поврежден"));
return false; return false;
} }
lastCharBuffer_[i + 1] = fgetc(file_); buf[i + 1] = ch;
} }
} }
std::string sb(buffer);
std::wstring res; std::string sb(buf);
EncodingError encodingError; EncodingError encodingError;
res = Coder::decode(encoding_, sb, encodingError); std::wstring res = Coder::decode(encoding_, sb, encodingError);
if (encodingError) { if (encodingError) {
Core::abort(Core::fromUtf8("Ошибка перекодирования при чтении данных из текстового файла")); Core::abort(Core::fromUtf8("Ошибка перекодирования при чтении данных из текстового файла"));
return false; return false;
} }
if (res.length() == 0) { if (res.length() != 1) {
Core::abort(Core::fromUtf8("Ошибка перекодирования при чтении данных из текстового файла")); Core::abort(Core::fromUtf8("Ошибка перекодирования при чтении данных из текстового файла"));
return false; return false;
} }
x = res.at(0); x = lastChar_ = res.at(0);
if (x == 0xfeff && currentPosition_ == lastCharLength_) {
// Initial BOM, transform it to space
x = lastChar_ = Char(' ');
}
return true; return true;
} }
} }
...@@ -868,6 +900,8 @@ void IO::InputStream::pushLastCharBack() ...@@ -868,6 +900,8 @@ void IO::InputStream::pushLastCharBack()
} else if (type() == ExternalBuffer) { } else if (type() == ExternalBuffer) {
externalBuffer_->pushLastCharBack(); externalBuffer_->pushLastCharBack();
} else { /* File */ } else { /* File */
#if 0
if (file_ == stdin) { if (file_ == stdin) {
if (lastCharBuffer_[2] != '\0') { if (lastCharBuffer_[2] != '\0') {
ungetc(lastCharBuffer_[2], file_); ungetc(lastCharBuffer_[2], file_);
...@@ -875,10 +909,19 @@ void IO::InputStream::pushLastCharBack() ...@@ -875,10 +909,19 @@ void IO::InputStream::pushLastCharBack()
if (lastCharBuffer_[1] != '\0') { if (lastCharBuffer_[1] != '\0') {
ungetc(lastCharBuffer_[1], file_); ungetc(lastCharBuffer_[1], file_);
} }
ungetc(lastCharBuffer_[0], file_); int res = ungetc(lastCharBuffer_[0], file_);
fprintf(stderr, "%s:%d ungetc=%d \n", __FILE__, __LINE__, res);
} else { } else {
fseek(file_, -1 * strlen(lastCharBuffer_), SEEK_CUR); fseek(file_, -1 * strlen(lastCharBuffer_), SEEK_CUR);
} }
#else
if (lastCharHere_) {
fprintf(stderr, "InputStream: cannot push back more than one character, doing nothing.");
} else {
lastCharHere_ = true;
currentPosition_ -= lastCharLength_;
}
#endif
} }
} }
...@@ -886,18 +929,14 @@ void IO::InputStream::pushLastCharBack() ...@@ -886,18 +929,14 @@ void IO::InputStream::pushLastCharBack()
String IO::InputStream::readUntil(const String &delimeters) String IO::InputStream::readUntil(const String &delimeters)
{ {
String result; String result;
result.reserve(100); result.reserve(10);
Char current; Char current;
while (readRawChar(current)) { while (readRawChar(current)) {
if (delimeters.find_first_of(current) != String::npos if (delimeters.find_first_of(current) != String::npos ) {
&& current != Char('\r')
) {
pushLastCharBack(); pushLastCharBack();
break; break;
} else { } else {
if (current != Char('\r')) { result.push_back(current);
result.push_back(current);
}
} }
} }
return result; return result;
...@@ -906,18 +945,17 @@ String IO::InputStream::readUntil(const String &delimeters) ...@@ -906,18 +945,17 @@ String IO::InputStream::readUntil(const String &delimeters)
void IO::InputStream::skipDelimiters(const String &del) void IO::InputStream::skipDelimiters(const String &del)
{ {
const String delim = del.empty() ? inputDelimiters : del; const String &delim = del.empty() ? inputDelimiters : del;
// Skip delimiters until lexem // Skip delimiters until lexem
Char skip(32); Char skip(32);
while (readRawChar(skip)) { while (readRawChar(skip)) {
if (delim.find_first_of(skip) == String::npos if (delim.find_first_of(skip) == String::npos) {
&& skip != Char('\r')
) {
pushLastCharBack(); pushLastCharBack();
break; break;
} }
} }
markPossibleErrorStart();
} }
...@@ -933,7 +971,6 @@ StringList IO::splitIntoLexemsByDelimeter( ...@@ -933,7 +971,6 @@ StringList IO::splitIntoLexemsByDelimeter(
if (s[i] == delim) { if (s[i] == delim) {
result.push_back(current); result.push_back(current);
current.clear(); current.clear();
current.reserve(10);
} else if (s[i] != ' ') { } else if (s[i] != ' ') {
current.push_back(s[i]); current.push_back(s[i]);
} }
...@@ -947,9 +984,8 @@ StringList IO::splitIntoLexemsByDelimeter( ...@@ -947,9 +984,8 @@ StringList IO::splitIntoLexemsByDelimeter(
String IO::readWord(InputStream &is) String IO::readWord(InputStream &is)
{ {
String delim = inputDelimiters; String delim = inputDelimiters;
is.skipDelimiters(delim);
// Mark as lexem begin position // Mark as lexem begin position
is.markPossibleErrorStart(); is.skipDelimiters(delim);
return is.readUntil(delim); return is.readUntil(delim);
} }
...@@ -957,9 +993,8 @@ String IO::readWord(InputStream &is) ...@@ -957,9 +993,8 @@ String IO::readWord(InputStream &is)
String IO::readString(InputStream &is) String IO::readString(InputStream &is)
{ {
String delim = inputDelimiters; String delim = inputDelimiters;
is.skipDelimiters(delim);
// Mark as lexem begin position // Mark as lexem begin position
is.markPossibleErrorStart(); is.skipDelimiters(delim);
Char bracket = Char('\0'); Char bracket = Char('\0');
if (!is.readRawChar(bracket)) { if (!is.readRawChar(bracket)) {
is.setError(Core::fromUtf8("Не могу прочитать литерал: текст закончился")); is.setError(Core::fromUtf8("Не могу прочитать литерал: текст закончился"));
...@@ -971,7 +1006,7 @@ String IO::readString(InputStream &is) ...@@ -971,7 +1006,7 @@ String IO::readString(InputStream &is)
} }
Char current; Char current;
String result; String result;
result.reserve(100); result.reserve(10);
while (is.readRawChar(current)) { while (is.readRawChar(current)) {
if (current != bracket) { if (current != bracket) {
result.push_back(current); result.push_back(current);
...@@ -980,10 +1015,7 @@ String IO::readString(InputStream &is) ...@@ -980,10 +1015,7 @@ String IO::readString(InputStream &is)
} }
} }
if (current != bracket) { if (current != bracket) {
// is.setError(Core::fromUtf8("Ошибка чтения литерала: текст закончился раньше, чем появилась закрывающая кавычка")); // is.setError(Core::fromUtf8("Ошибка чтения литерала: текст закончился раньше, чем появилась закрывающая кавычка"));
} else {
// Skip closing bracket
is.readRawChar(bracket);
} }
return result; return result;
} }
...@@ -991,16 +1023,17 @@ String IO::readString(InputStream &is) ...@@ -991,16 +1023,17 @@ String IO::readString(InputStream &is)
String IO::readLine(InputStream &is) String IO::readLine(InputStream &is)
{ {
String result; String result;
result.reserve(100); result.reserve(10);
Char current; Char current;