fix file name truncation in presence of non-ascii characters (#1402)

* fix file name truncation in presence of non-ascii characters

* review: remove unnecessary -4 of maxNameLength

* review: count byte length of extension

* review: fix typo three -> four

* review: rename byteLength -> maxLength

* review: fix case where last character is incorrectly truncated

* Add some unit tests
This commit is contained in:
Jakob Hellermann 2024-07-11 05:04:52 +02:00 committed by GitHub
parent 07804fc215
commit 716a07f50b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 102 additions and 4 deletions

View File

@ -0,0 +1,45 @@
using AssetRipper.IO.Files.Utils;
namespace AssetRipper.IO.Files.Tests;
/// <summary>
/// Tests for the byte-length-aware file name truncation performed by
/// <c>FileUtils.GetUniqueName</c> when a name contains multi-byte UTF-8 characters.
/// </summary>
public static class FileUtilsTests
{
	// UTF-8 layout of "A文": 'A' is a single byte, '文' is a three-byte sequence.
	//   A          lead (3)   cont       cont
	//   01000001   11100110   10010110   10000111

	/// <summary>
	/// Walks the length limit across the three-byte character of "A文.ext" and checks
	/// that the character is either kept whole or dropped whole.
	/// </summary>
	[Test]
	public static void FilenameTruncationMultibyteCharacter()
	{
		(int MaxNameLength, string Expected)[] cases =
		{
			(4, ".ext"),
			(5, "A.ext"),
			(6, "A.ext"),
			(7, "A.ext"),
			(8, "A文.ext"),
			(9, "A文.ext"),
		};

		Assert.Multiple(() =>
		{
			foreach ((int maxNameLength, string expected) in cases)
			{
				Assert.That(FileUtils.GetUniqueName("/dir", "A文.ext", maxNameLength), Is.EqualTo(expected));
			}
		});
	}

	/// <summary>
	/// Checks that the extension's length is counted against the limit, using a
	/// four-character and a five-character extension.
	/// </summary>
	[Test]
	public static void ExtensionLength()
	{
		(string FileName, int MaxNameLength, string Expected)[] cases =
		{
			("A文.ext", 7, "A.ext"),
			("A文.ext", 8, "A文.ext"),
			("A文.exte", 8, "A.exte"),
			("A文.exte", 9, "A文.exte"),
		};

		Assert.Multiple(() =>
		{
			foreach ((string fileName, int maxNameLength, string expected) in cases)
			{
				Assert.That(FileUtils.GetUniqueName("/dir", fileName, maxNameLength), Is.EqualTo(expected));
			}
		});
	}

	/// <summary>
	/// Checks truncation of a name that has no extension at all.
	/// </summary>
	[Test]
	public static void WithoutExtension()
	{
		(int MaxNameLength, string Expected)[] cases =
		{
			(3, "A"),
			(4, "A文"),
		};

		Assert.Multiple(() =>
		{
			foreach ((int maxNameLength, string expected) in cases)
			{
				Assert.That(FileUtils.GetUniqueName("/dir", "A文", maxNameLength), Is.EqualTo(expected));
			}
		});
	}
}

View File

@ -1,4 +1,4 @@
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
namespace AssetRipper.IO.Files.Utils
@ -37,12 +37,11 @@ namespace AssetRipper.IO.Files.Utils
{
string? ext = null;
string? name = null;
int maxLength = maxNameLength - 4;
string validFileName = fileName;
if (validFileName.Length > maxLength)
if (Encoding.UTF8.GetByteCount(fileName) > maxNameLength)
{
ext = Path.GetExtension(validFileName);
name = validFileName.Substring(0, maxLength - ext.Length);
name = TruncateToUTF8ByteLength(fileName, maxNameLength - Encoding.UTF8.GetByteCount(ext));
validFileName = name + ext;
}
@ -120,5 +119,59 @@ namespace AssetRipper.IO.Files.Utils
"lpt1", "lpt2", "lpt3", "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9",
];
private static readonly Regex FileNameRegex = GenerateFileNameRegex();
/// <summary>
/// Shortens <paramref name="str"/> so that its UTF-8 encoding is at most
/// <paramref name="maxLength"/> bytes long, without cutting a multi-byte character in half.
/// </summary>
/// <param name="str">The text to shorten.</param>
/// <param name="maxLength">The maximum number of UTF-8 bytes to keep.</param>
/// <returns>The string decoded from the retained leading bytes.</returns>
private static string TruncateToUTF8ByteLength(string str, int maxLength)
{
	byte[] encoded = Encoding.UTF8.GetBytes(str);

	// The cut position is chosen by FindValidByteLength so that it lands on a character boundary.
	int keepCount = FindValidByteLength(encoded, maxLength);

	return Encoding.UTF8.GetString(encoded, 0, keepCount);
}
/// <summary>
/// Finds how many leading bytes of <paramref name="bytes"/> can be kept, using at most
/// <paramref name="maxLength"/> bytes, without splitting a multi-byte UTF-8 sequence.
/// </summary>
/// <param name="bytes">UTF-8 encoded text.</param>
/// <param name="maxLength">
/// The byte budget. Zero or negative values (for example when a caller subtracts an
/// extension longer than the overall limit) yield 0 instead of an out-of-range access.
/// </param>
/// <returns>
/// A length between 0 and <c>Math.Min(maxLength, bytes.Length)</c> that does not end
/// inside a multi-byte sequence.
/// </returns>
private static int FindValidByteLength(byte[] bytes, int maxLength)
{
	// UTF-8 byte layouts:
	// ascii char:      0xxxxxxx
	// two-byte char:   110xxxxx 10xxxxxx
	// three-byte char: 1110xxxx 10xxxxxx 10xxxxxx
	// four-byte char:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	const int continuationMask = 0b11_000000;
	const int continuationBits = 0b10_000000;
	const int highBit = 0b10000000;

	// No room for any byte. This guard also keeps a negative budget from being
	// used as an array index below.
	if (maxLength <= 0)
	{
		return 0;
	}

	// Everything fits, nothing to cut.
	if (maxLength >= bytes.Length)
	{
		return bytes.Length;
	}

	// The first dropped byte starts a new character, so cutting at maxLength is safe.
	if ((bytes[maxLength] & continuationMask) != continuationBits)
	{
		return maxLength;
	}

	// The cut would land inside a multi-byte sequence: back up over its
	// continuation bytes and its lead byte so the whole character is dropped.
	int validLength = maxLength;
	while (validLength > 0)
	{
		byte previous = bytes[validLength - 1];
		if ((previous & continuationMask) == continuationBits)
		{
			// continuation byte
			validLength--;
			continue;
		}

		if ((previous & highBit) == highBit)
		{
			// lead byte of the split sequence
			validLength--;
		}

		// lead byte removed, or an ascii byte reached: stop here
		break;
	}

	return validLength;
}
}
}