diff options
author | Ladislav Zezula <zezula@volny.cz> | 2024-11-02 09:18:37 +0100 |
---|---|---|
committer | Ladislav Zezula <zezula@volny.cz> | 2024-11-02 09:18:37 +0100 |
commit | 2ec11ad1fd50098a789a9b477bb9c4240f569e7e (patch) | |
tree | 27cf630257df50b4af8950af1391ae18bebc7066 /src/SMemUtf8.cpp | |
parent | cc17c9bc5a6d5a85487beef60eab36e1c1513e00 (diff) |
Added functions for conversions between MPQ file name <-> Safe file name
Diffstat (limited to 'src/SMemUtf8.cpp')
-rw-r--r-- | src/SMemUtf8.cpp | 551 |
1 files changed, 551 insertions, 0 deletions
diff --git a/src/SMemUtf8.cpp b/src/SMemUtf8.cpp new file mode 100644 index 0000000..5832422 --- /dev/null +++ b/src/SMemUtf8.cpp @@ -0,0 +1,551 @@ +/*****************************************************************************/ +/* SFileVerify.cpp Copyright (c) Ladislav Zezula 2010 */ +/*---------------------------------------------------------------------------*/ +/* Support for conversion of UTF-8 <-> File name */ +/* */ +/* File names in the MPQs are assumed to be UTF-8. However, bad sequences */ +/* or filename unsafe characters are allowed in the list files, but won't */ +/* work in unpacking files from MPQ to a local file. */ +/* */ +/* This module contains cross-platform comparable conversion between UTF-8 */ +/* and file names that will produce identical file names across platforms. */ +/*---------------------------------------------------------------------------*/ +/* Date Ver Who Comment */ +/* -------- ---- --- ------- */ +/* 31.10.24 1.00 Lad Created */ +/*****************************************************************************/ + +#define __STORMLIB_SELF__ +#include "StormLib.h" +#include "StormCommon.h" + +//----------------------------------------------------------------------------- +// Local defines + +#define MAX_INVALID_CHARS 128 // Maximum number of invalid characters in a row + +//----------------------------------------------------------------------------- +// Conversion tables + +const unsigned char SMemCharToByte[0x80] = +{ + // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0xFF + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x10 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x20 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x30 + 0xFF, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x40 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x50 + 0xFF, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x60 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF // 0x70 +}; + +//----------------------------------------------------------------------------- +// Local functions + +// Bit mask of characters that are file name safe. We will maintain +// the same charset even on non-Windows in order to keep the file names equal +static unsigned int FileNameSafeChars[4] = +{ + 0x00000000, 0x2BFF7BFB, 0xFFFFFFFF, 0xEFFFFFFF // Windows: [0x20-0x7F], except 0x22, 0x2A, 0x2F, 0x3A, 0x3C, 0x3E, 0x3F, 0x7C +// 0xfffffffe, 0xffff7fff, 0xffffffff, 0xffffffff // Linux: [0x01-0x7F], except 0x2F +}; + +static bool UTF8_IsBadFileNameCharacter(DWORD ch) +{ + // It is guaranteed that the character is in range of 0x00 - 0x7F + assert(ch < 0x80); + + // Use the bit from the table + return (FileNameSafeChars[ch / 32] & (1 << (ch % 32))) ? false : true; +} + +static DWORD UTF8_DecodeSequence(const BYTE * pbString, BYTE BitsMask, size_t ccFollowBytes, DWORD dwMinValue, DWORD dwMaxValue, DWORD & dwCodePoint, size_t & ccBytesEaten) +{ + const BYTE * pbSaveString = pbString; + DWORD dwAccumulator; + + // Extract the low bits from the leading byte + dwAccumulator = pbString[0] & BitsMask; + ccBytesEaten = 1; + pbString++; + + // Process the follow-up bytes + for(size_t i = 0; i < ccFollowBytes; i++) + { + // Every follow-up byte in the UTF-8 sequence must start with 10xxxxxx + if((pbString[0] & 0xC0) != 0x80) + return ERROR_NO_UNICODE_TRANSLATION; + + // Add 6 bits to the accumulator + dwAccumulator = (dwAccumulator << 6) | (*pbString++ & 0x3F); + } + + // Check whether the code point is in the given range + if(!(dwMinValue <= dwAccumulator && dwAccumulator <= dwMaxValue)) + return ERROR_INVALID_DATA; + + // Give the number of bytes eaten and the decoded code point + ccBytesEaten = (pbString - pbSaveString); + dwCodePoint = dwAccumulator; + return ERROR_SUCCESS; +} + +// https://en.wikipedia.org/wiki/UTF-8 +static DWORD UTF8_DecodeCodePoint(const BYTE * pbString, const BYTE * pbStringEnd, DWORD & dwCodePoint, size_t & ccBytesEaten) +{ + // Reset the number of bytes eaten + dwCodePoint = SFILE_UTF8_INVALID_CHARACTER; + ccBytesEaten = 0; + + if(pbString < pbStringEnd) + { + // At least one byte will be eaten + ccBytesEaten = 1; + + // 1st code point (0x00 - 0x7F, 1 byte) + if(pbString[0] <= 0x7F) + { + // This is the perfect spot to check for filename-unsafe characters + if(UTF8_IsBadFileNameCharacter(pbString[0])) + return ERROR_NO_UNICODE_TRANSLATION; + + // Decode the 1-byte sequence + dwCodePoint = pbString[0]; + return ERROR_SUCCESS; + } + + // 2nd code point (0x80 - 0x7FF, 2 bytes) + if((pbString[0] & 0xE0) == 0xC0 && (pbString + 2) <= pbStringEnd) + { + // Decode the 2-byte sequence + return UTF8_DecodeSequence(pbString, 0x1F, 1, 0x80, 0x7FF, dwCodePoint, ccBytesEaten); + } + + // 3rd code point (0x800 - 0xFFFF, 3 bytes) + // Note: MultiByteToWideChar will not decode 0xE0 0xBF 0xBF (--> 0x0FFF), + if((pbString[0] & 0xF0) == 0xE0 && (pbString + 3) <= pbStringEnd) + { + // Decode the 3-byte sequence + return UTF8_DecodeSequence(pbString, 0x0F, 2, 0x800, 0xFFFF, dwCodePoint, ccBytesEaten); + } + + // 4th code point (0x10000 - 0x10FFFF, 4 bytes) + if((pbString[0] & 0xF8) == 0xF0 && (pbString + 4) <= pbStringEnd) + { + // Try to decode 4-byte sequence + return UTF8_DecodeSequence(pbString, 0x07, 3, 0x10000, SFILE_UNICODE_MAX, dwCodePoint, ccBytesEaten); + } + + // An invalid UTF-8 sequence encountered + return ERROR_NO_UNICODE_TRANSLATION; + } + + // No bytes available. Should never happen + assert(false); + return ERROR_BUFFER_OVERFLOW; +} + +static size_t UTF8_EncodeSequence(DWORD dwCodePoint, BYTE LeadingByte, DWORD dwFollowByteCount, LPBYTE Utf8Buffer) +{ + DWORD dwByteShift = dwFollowByteCount * 6; + + // Encode the highest byte + Utf8Buffer[0] = (BYTE)(LeadingByte | (dwCodePoint >> dwByteShift)); + dwByteShift -= 6; + + // Encode the follow bytes + for(DWORD i = 0; i < dwFollowByteCount; i++) + { + // The follow byte must be 10xxxxxx + Utf8Buffer[i + 1] = (BYTE)(0x80 | ((dwCodePoint >> dwByteShift) & 0x3F)); + dwByteShift -= 6; + } + + return dwFollowByteCount + 1; +} + +static size_t UTF8_EncodeCodePoint(DWORD dwCodePoint, LPBYTE Utf8Buffer) +{ + // 0x00 - 0x7F, 1 byte + if(dwCodePoint < 0x80) + return UTF8_EncodeSequence(dwCodePoint, 0x00, 0, Utf8Buffer); + + // 0x80 - 0x7FF + if(dwCodePoint < 0x800) + return UTF8_EncodeSequence(dwCodePoint, 0xC0, 1, Utf8Buffer); + + // 0x800 - 0xFFFF + if(dwCodePoint < 0x10000) + return UTF8_EncodeSequence(dwCodePoint, 0xE0, 2, Utf8Buffer); + + // 0x800 - 0xFFFF + if(dwCodePoint < 0x110000) + return UTF8_EncodeSequence(dwCodePoint, 0xF0, 3, Utf8Buffer); + + // Should never happen + assert(false); + return 0; +} + +static size_t UTF8_FlushInvalidChars(LPTSTR szBuffer, size_t ccBuffer, size_t nOutLength, LPBYTE InvalidChars, size_t nInvalidChars) +{ + // Case 0: No invalid char -> do nothing + if(nInvalidChars == 0) + { + return nOutLength; + } + + // Case 1: One invalid char -> %xx (compatible with previous versions of MPQ Editor) + if(nInvalidChars == 1) + { + // Space for 3 characters needed + if(szBuffer != NULL && (nOutLength + 3) <= ccBuffer) + { + szBuffer[nOutLength] = '%'; + SMemBinToStr(szBuffer + nOutLength + 1, ccBuffer - 1, InvalidChars, 1); + } + return nOutLength + 3; + } + + // Case 1: More than one invalid char -> %u[xxyyzz] + else + { + // Enough space for %u[xxyyzz] + size_t nLengthNeeded = nInvalidChars * 2 + 4; + + // Space for 4 characters needed + if(szBuffer != NULL && (nOutLength + nLengthNeeded) <= ccBuffer) + { + memcpy(szBuffer + nOutLength, _T("%u["), 6); + + SMemBinToStr(szBuffer + nOutLength + 3, ccBuffer - 3, InvalidChars, nInvalidChars); + + szBuffer[nOutLength + nLengthNeeded - 1] = ']'; + szBuffer[nOutLength + nLengthNeeded] = 0; + } + return nOutLength + nLengthNeeded; + } +} + +size_t UTF8_FlushBinBuffer(LPBYTE pbBuffer, size_t ccBuffer, size_t nOutLength, LPBYTE BinBuffer, size_t nByteCount) +{ + if(pbBuffer != NULL && (nOutLength + nByteCount) < ccBuffer) + memcpy(pbBuffer + nOutLength, BinBuffer, nByteCount); + return nOutLength + nByteCount; +} + +#ifdef STORMLIB_WIDE_CHAR +static size_t UTF16_EncodeCodePoint(DWORD dwCodePoint, unsigned short * Utf16Buffer) +{ + // https://en.wikipedia.org/wiki/UTF-16 + if(dwCodePoint <= 0xFFFF) + { + Utf16Buffer[0] = (unsigned short)(dwCodePoint); + return 1; + } + + if(dwCodePoint <= SFILE_UNICODE_MAX) + { + // Fix the code point + dwCodePoint -= 0x10000; + + // Split the code point to two 10-bit values + Utf16Buffer[0] = (unsigned short)(0xD800 + (dwCodePoint >> 10)); // High 6 bytes + Utf16Buffer[1] = (unsigned short)(0xDC00 + (dwCodePoint & 0x3FF)); // Low 10 bytes + return 2; + } + + // Should never happen + assert(false); + return 0; +} + +static DWORD UTF16_DecodeCodePoint(LPCTSTR szString, LPCTSTR szStringEnd, DWORD & dwCodePoint, size_t & ccCharsEaten) +{ + // Reset the number of bytes eaten + dwCodePoint = SFILE_UTF8_INVALID_CHARACTER; + ccCharsEaten = 0; + + if(szString < szStringEnd) + { + // At least one char will be eaten + ccCharsEaten = 1; + + // Check for an invalid surrogate pair + if(0xDC00 <= szString[0] && szString[0] <= 0xDFFF) + { + dwCodePoint = SFILE_UTF8_INVALID_CHARACTER; + return ERROR_NO_UNICODE_TRANSLATION; + } + + // Check for a valid surrogate pair + if(0xD800 <= szString[0] && szString[0] <= 0xDBFF && (szString + 1) < szStringEnd) + { + dwCodePoint = ((szString[0] - 0xD800) << 10) | (szString[1] - 0xDC00) + 0x10000; + ccCharsEaten = 2; + return ERROR_SUCCESS; + } + + // Direct encoding + dwCodePoint = szString[0]; + ccCharsEaten = 1; + return ERROR_SUCCESS; + } + + // No bytes available. Should never happen + assert(false); + return ERROR_BUFFER_OVERFLOW; +} +#endif + +size_t UTF16_IsEncodedCharSequence(LPCTSTR szString, LPCTSTR szStringEnd, LPBYTE BinBuffer) +{ + size_t nEncodedChars = 0; + + if((szString + 1) < szStringEnd && *szString++ == '%') + { + if((szString + 1) < szStringEnd && *szString++ == 'u') + { + if((szString + 1) < szStringEnd && *szString++ == '[') + { + // Keep going as long as we can convert + for(size_t i = 0; i < MAX_INVALID_CHARS; i++) + { + if(szString + (i * 2) >= szStringEnd) + break; + if(szString[i * 2] == ']') + break; + nEncodedChars++; + } + + // Did we encounter the end of the string? + if(szString + (nEncodedChars * 2) + 1 <= szStringEnd && szString[nEncodedChars * 2] == ']') + { + TCHAR HexaString[MAX_INVALID_CHARS * 2 + 1]; + + // Copy the hexadecimal string + memcpy(HexaString, szString, (nEncodedChars * 2) * sizeof(TCHAR)); + HexaString[nEncodedChars * 2] = 0; + + // Try to decode the hexa string + if(SMemStrToBin(HexaString, BinBuffer, nEncodedChars) == ERROR_SUCCESS) + { + return nEncodedChars; + } + } + } + } + } + return 0; +} + +//----------------------------------------------------------------------------- +// Public (exported) functions + +// Conversion of MPQ file name to file-name-safe string +DWORD WINAPI SMemUTF8ToFileName( + LPTSTR szBuffer, // Pointer to the output buffer. If NULL, the function will calulate the needed length + size_t ccBuffer, // Length of the output buffer (must include EOS) + const void * lpString, // Pointer to the begin of the string + const void * lpStringEnd, // Pointer to the end of string. If NULL, it's assumed to be zero-terminated + DWORD dwFlags, // Additional flags + size_t * pOutLength = NULL) // Pointer to a variable that receives the needed length (optional) +{ + const BYTE * pbStringEnd = (const BYTE *)lpStringEnd; + const BYTE * pbString = (const BYTE *)lpString; + DWORD dwErrCode = ERROR_SUCCESS; + size_t nInvalidChars = 0; + size_t nOutLength = 0; + BYTE InvalidChars[MAX_INVALID_CHARS]; + + // Set the end of the input if not specified + if(pbStringEnd == NULL) + pbStringEnd = pbString + strlen((char *)pbString); + + // Keep conversion as long + while(pbString < pbStringEnd) + { + size_t ccBytesEaten = 0; + size_t nCharLength; + DWORD dwCodePoint = 0; + + // Decode the single UTF-8 char + if((dwErrCode = UTF8_DecodeCodePoint(pbString, pbStringEnd, dwCodePoint, ccBytesEaten)) != ERROR_SUCCESS) + { + // Exactly one byte should be eaten on error + assert(ccBytesEaten == 1); + + // If invalid chars are allowed, we replace the result with 0xFFFD + if(dwFlags & SFILE_UTF8_ALLOW_INVALID_CHARS) + { + // Replace the code point with invalid marker and continue on the next character + dwCodePoint = SFILE_UTF8_INVALID_CHARACTER; + dwErrCode = ERROR_SUCCESS; + } + + // If the invalid chars are not allowed, we put the invalid char to the stack + else + { + // Flush the invalid characters, if full + if(nInvalidChars >= _countof(InvalidChars)) + { + nOutLength = UTF8_FlushInvalidChars(szBuffer, ccBuffer, nOutLength, InvalidChars, nInvalidChars); + nInvalidChars = 0; + } + + // Put the invalid char to the stack + InvalidChars[nInvalidChars++] = pbString[0]; + pbString++; + continue; + } + } + + // Check whether the unicode char is not out of range + assert(dwCodePoint <= SFILE_UNICODE_MAX); + + // Move the source pointer by the number of bytes eaten + pbString = pbString + ccBytesEaten; + + // Flush the invalid characters, if any + nOutLength = UTF8_FlushInvalidChars(szBuffer, ccBuffer, nOutLength, InvalidChars, nInvalidChars); + nInvalidChars = 0; + +#ifdef STORMLIB_WIDE_CHAR + { + unsigned short Utf16Buffer[2]; + + // Encode the code point into UTF-16 + nCharLength = UTF16_EncodeCodePoint(dwCodePoint, Utf16Buffer); + + // Write the encoded UTF-16 to the output buffer, if present + if(szBuffer != NULL && (nOutLength + nCharLength) < ccBuffer) + { + memcpy(szBuffer + nOutLength, Utf16Buffer, nCharLength * sizeof(unsigned short)); + } + } +#else + { + BYTE Utf8Buffer[4]; + + // Encode the code point into UTF-8 + nCharLength = UTF8_EncodeCodePoint(dwCodePoint, Utf8Buffer); + + // Write the encoded UTF-16 to the output buffer, if present + if(szBuffer != NULL && (nOutLength + nCharLength) < ccBuffer) + { + memcpy(szBuffer + nOutLength, Utf8Buffer, nCharLength); + } + } +#endif + + // Increment the output length + nOutLength = nOutLength + nCharLength; + } + + // Flush the invalid characters, if any + nOutLength = UTF8_FlushInvalidChars(szBuffer, ccBuffer, nOutLength, InvalidChars, nInvalidChars); + nInvalidChars = 0; + + // Terminate the string with zero, if we still have space + if(szBuffer != NULL && nOutLength < ccBuffer) + szBuffer[nOutLength] = 0; + nOutLength++; + + // Give the output length, if required + if(pOutLength != NULL) + pOutLength[0] = nOutLength; + return dwErrCode; +} + +DWORD WINAPI SMemFileNameToUTF8( + void * lpBuffer, // Pointer to the output buffer. If NULL, the function will calulate the needed length + size_t ccBuffer, // Length of the output buffer (must include EOS) + const TCHAR * szString, // Pointer to the begin of the string + const TCHAR * szStringEnd, // Pointer to the end of string. If NULL, it's assumed to be zero-terminated + DWORD /* dwFlags */, // Additional flags + size_t * pOutLength = NULL) // Pointer to a variable that receives the needed length in bytes (optional) +{ + LPBYTE pbBuffer = (LPBYTE)lpBuffer; + size_t nOutLength = 0; + DWORD dwErrCode = ERROR_SUCCESS; + + // Set the end of the input if not specified + if(szStringEnd == NULL) + szStringEnd = szString + _tcslen(szString); + + // Keep conversion as long + while(szString < szStringEnd) + { + size_t ccCharsEaten = 0; + size_t nUtf8Length; + DWORD dwCodePoint = 0; + BYTE Utf8Buffer[MAX_INVALID_CHARS]; + + // Check for encoded sequence of bytes + if(szString[0] == '%') + { + // If there is a single hexa number ("%c7"), decode that number + if((szString + 3) <= szStringEnd) + { + TCHAR HexaString[3] = {0}; + + HexaString[0] = szString[1]; + HexaString[1] = szString[2]; + if(SMemStrToBin(HexaString, Utf8Buffer, 1) == ERROR_SUCCESS) + { + nOutLength = UTF8_FlushBinBuffer(pbBuffer, ccBuffer, nOutLength, Utf8Buffer, 1); + szString += 3; + continue; + } + } + + // If there is an escaped sequence ("%u[aabbcc]"), decode that sequence + if((nUtf8Length = UTF16_IsEncodedCharSequence(szString, szStringEnd, Utf8Buffer)) != 0) + { + nOutLength = UTF8_FlushBinBuffer(pbBuffer, ccBuffer, nOutLength, Utf8Buffer, nUtf8Length); + szString += (nUtf8Length * 2) + 4; + continue; + } + } + +#ifdef STORMLIB_WIDE_CHAR + // Try to decode the code point from UTF-16 + if((dwErrCode = UTF16_DecodeCodePoint(szString, szStringEnd, dwCodePoint, ccCharsEaten)) != ERROR_SUCCESS) + return dwErrCode; +#else + // Try to decode the code point from UTF-16 + if((dwErrCode = UTF8_DecodeCodePoint((const BYTE *)szString, (const BYTE *)szStringEnd, dwCodePoint, ccCharsEaten)) != ERROR_SUCCESS) + return dwErrCode; +#endif + + // Check whether the unicode char is not out of range + assert(dwCodePoint <= SFILE_UNICODE_MAX); + + // Move the source pointer by the number of bytes eaten + szString = szString + ccCharsEaten; + + // Encode the UNICODE char + nUtf8Length = UTF8_EncodeCodePoint(dwCodePoint, Utf8Buffer); + + // Do we have enough space in the buffer? + if(pbBuffer != NULL && (nOutLength + nUtf8Length) < ccBuffer) + { + // Write the encoded UTF-16 to the output + memcpy(pbBuffer + nOutLength, Utf8Buffer, nUtf8Length); + } + + // Increment the output length + nOutLength = nOutLength + nUtf8Length; + } + + // Terminate the string with zero, if we still have space + if(pbBuffer != NULL && nOutLength < ccBuffer) + pbBuffer[nOutLength] = 0; + nOutLength++; + + // Give the output length, if required + if(pOutLength != NULL) + pOutLength[0] = nOutLength; + return dwErrCode; +} |