/* SPDX-FileCopyrightText: 2010 Christoph Cullmann SPDX-License-Identifier: LGPL-2.0-or-later */ #ifndef KATE_TEXTLOADER_H #define KATE_TEXTLOADER_H #include #include #include #include #include #include #include #include "katetextbuffer.h" namespace Kate { /** * loader block size, load 256 kb at once per default * if file size is smaller, fall back to file size * must be a multiple of 2 */ static const qint64 KATE_FILE_LOADER_BS = 256 * 1024; /** * File Loader, will handle reading of files + detecting encoding */ class TextLoader { public: /** * Construct file loader for given file. * @param filename file to open * @param proberType prober type * @param lineLengthLimit limit for lines to load, else we break them up in smaller ones */ TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit) : m_eof(false) // default to not eof , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline , m_lastWasR(false) // we have not found a \r as last char , m_position(0) , m_lastLineStart(0) , m_eol(TextBuffer::eolUnknown) // no eol type detected atm , m_buffer(KATE_FILE_LOADER_BS, 0) , m_digest(QCryptographicHash::Sha1) , m_bomFound(false) , m_firstRead(true) , m_proberType(proberType) , m_fileSize(0) , m_lineLengthLimit(lineLengthLimit) { // try to get mimetype for on the fly decompression, don't rely on filename! QFile testMime(filename); if (testMime.open(QIODevice::ReadOnly)) { m_fileSize = testMime.size(); } m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name(); // construct filter device KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(m_mimeType); m_file = new KCompressionDevice(filename, compressionType); } /** * Destructor */ ~TextLoader() { delete m_file; } /** * open file with given codec * @param codec codec to use, if 0, will do some auto-detect or fallback * @return success */ bool open(const QString &codec) { m_codec = codec; m_eof = false; m_lastWasEndOfLine = true; m_lastWasR = false; m_position = 0; m_lastLineStart = 0; m_alreadyScanned = -1; m_eol = TextBuffer::eolUnknown; m_text.clear(); m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData()); m_bomFound = false; m_firstRead = true; // init the hash with the git header const QString header = QStringLiteral("blob %1").arg(m_fileSize); m_digest.reset(); m_digest.addData(QByteArray(header.toLatin1() + '\0')); // if already opened, close the file... if (m_file->isOpen()) { m_file->close(); } return m_file->open(QIODevice::ReadOnly); } /** * end of file reached? * @return end of file reached */ bool eof() const { return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); } /** * Detected end of line mode for this file. * Detected during reading, is valid after complete file is read. * @return eol mode of this file */ TextBuffer::EndOfLineMode eol() const { return m_eol; } /** * BOM found? * @return byte order mark found? */ bool byteOrderMarkFound() const { return m_bomFound; } /** * mime type used to create filter dev * @return mime-type of filter device */ const QString &mimeTypeForFilterDev() const { return m_mimeType; } /** * internal Unicode data array * @return internal Unicode data */ const QChar *unicode() const { return m_text.unicode(); } /** * Get codec for this loader * @return currently in use codec of this loader */ QString textCodec() const { return m_codec; } /** * read a line, return length + offset in Unicode data * @param offset offset into internal Unicode data for read line * @param length length of read line * @param tooLongLinesWrapped was a too long line seen? * @param longestLineLoaded length of the longest line that hit the limit * @return true if no encoding errors occurred */ bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded) { length = 0; offset = 0; bool encodingError = false; static const QLatin1Char cr(QLatin1Char('\r')); static const QLatin1Char lf(QLatin1Char('\n')); /** * did we read two time but got no stuff? encoding error * fixes problem with one character latin-1 files, which lead to crash otherwise! * bug 272579 */ bool failedToConvertOnce = false; /** * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true * BUG: 440359 */ bool bomPreviouslyFound = m_bomFound; // honor the line length limit early const auto lineLimitHandler = [this, &offset, &length, &tooLongLinesWrapped, &longestLineLoaded](int lineStart, int textLength) { if ((m_lineLengthLimit <= 0) || (textLength <= m_lineLengthLimit)) { return false; } // remember stick error tooLongLinesWrapped = true; longestLineLoaded = std::max(longestLineLoaded, textLength); // search for place to wrap int spacePosition = m_lineLengthLimit - 1; for (int testPosition = m_lineLengthLimit - 1; (testPosition >= 0) && (testPosition >= (m_lineLengthLimit - (m_lineLengthLimit / 10))); --testPosition) { // wrap place found? if (m_text[lineStart + testPosition].isSpace() || m_text[lineStart + testPosition].isPunct()) { spacePosition = testPosition; break; } } m_lastWasEndOfLine = false; m_lastWasR = false; // line data offset = lineStart; length = spacePosition + 1; m_lastLineStart = m_position = (lineStart + length); return true; }; /** * reading loop */ while (m_position <= m_text.length()) { // handle too long lines early even if we not yet have seen the end if (m_alreadyScanned > m_lastLineStart && lineLimitHandler(m_lastLineStart, m_alreadyScanned - m_lastLineStart)) { return !encodingError; } if (m_position == m_text.length()) { // try to load more text if something is around if (!m_eof) { // kill the old lines... m_text.remove(0, m_lastLineStart); // try to read new data const int c = m_file->read(m_buffer.data(), m_buffer.size()); // if any text is there, append it.... if (c > 0) { // update hash sum m_digest.addData(QByteArrayView(m_buffer.data(), c)); // detect byte order marks & codec for byte order marks on first read if (m_firstRead) { // if no codec given, do autodetection if (!m_converterState.isValid()) { // use KEncodingProber first, QStringDecoder::decoderForHtml does fallback to UTF-8 KEncodingProber prober(m_proberType); prober.feed(QByteArrayView(m_buffer.data(), c)); // we found a codec with some confidence? if (const QStringDecoder decoder(prober.encoding().constData()); decoder.isValid() && (prober.confidence() > 0.5)) { m_converterState = QStringDecoder(prober.encoding().constData()); } else { // try to get HTML encoding, will default to UTF-8 // see https://doc.qt.io/qt-6/qstringdecoder.html#decoderForHtml m_converterState = QStringDecoder::decoderForHtml(m_buffer); } // no codec, no chance, encoding error, else remember the codec name if (!m_converterState.isValid()) { return false; } } // we want to convert the bom for later detection m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom); // remember name, might have changed m_codec = QString::fromUtf8(m_converterState.name()); } // detect broken encoding Q_ASSERT(m_converterState.isValid()); const QString unicode = m_converterState.decode(QByteArrayView(m_buffer.data(), c)); encodingError = encodingError || m_converterState.hasError(); // check and remove bom if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) { m_bomFound = true; m_text.append(QStringView(unicode).last(unicode.size() - 1)); // swapped BOM is encoding error encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped; } else { m_text.append(unicode); } m_firstRead = false; } // is file completely read ? m_eof = (c == -1) || (c == 0); // recalc current pos and last pos m_position -= m_lastLineStart; m_alreadyScanned = m_position - 1; m_lastLineStart = 0; } // oh oh, end of file, escape ! if (m_eof && (m_position == m_text.length())) { m_lastWasEndOfLine = false; // line data offset = m_lastLineStart; length = m_position - m_lastLineStart; m_lastLineStart = m_position; lineLimitHandler(offset, length); return !encodingError && !failedToConvertOnce; } // empty? try again if (m_position == m_text.length()) { if (!bomPreviouslyFound && m_bomFound) { // BOM was processed above, so we didn't fail to convert bomPreviouslyFound = true; } else { failedToConvertOnce = true; } continue; } } for (; m_position < m_text.length(); m_position++) { m_alreadyScanned = m_position; QChar current_char = m_text.at(m_position); if (current_char == lf) { m_lastWasEndOfLine = true; if (m_lastWasR) { m_lastLineStart++; m_lastWasR = false; m_eol = TextBuffer::eolDos; } else { // line data offset = m_lastLineStart; length = m_position - m_lastLineStart; m_lastLineStart = m_position + 1; m_position++; // only win, if not dos! if (m_eol != TextBuffer::eolDos) { m_eol = TextBuffer::eolUnix; } lineLimitHandler(offset, length); return !encodingError; } } else if (current_char == cr) { m_lastWasEndOfLine = true; m_lastWasR = true; // line data offset = m_lastLineStart; length = m_position - m_lastLineStart; m_lastLineStart = m_position + 1; m_position++; // should only win of first time! if (m_eol == TextBuffer::eolUnknown) { m_eol = TextBuffer::eolMac; } lineLimitHandler(offset, length); return !encodingError; } else if (current_char == QChar::LineSeparator) { m_lastWasEndOfLine = true; // line data offset = m_lastLineStart; length = m_position - m_lastLineStart; m_lastLineStart = m_position + 1; m_position++; lineLimitHandler(offset, length); return !encodingError; } else { m_lastWasEndOfLine = false; m_lastWasR = false; } } } return !encodingError; } QByteArray digest() { return m_digest.result(); } private: QString m_codec; bool m_eof; bool m_lastWasEndOfLine; bool m_lastWasR; int m_position; int m_lastLineStart; int m_alreadyScanned = -1; TextBuffer::EndOfLineMode m_eol; QString m_mimeType; QIODevice *m_file; QByteArray m_buffer; QCryptographicHash m_digest; QString m_text; QStringDecoder m_converterState; bool m_bomFound; bool m_firstRead; KEncodingProber::ProberType m_proberType; quint64 m_fileSize; const int m_lineLengthLimit; }; } #endif