Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ class UnicodeFunctionsTest(unittest.TestCase):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = ('a91d306c268ba7d5cdf14d49e63b3f967058869c'
expectedchecksum = ('a5b8431ae6c0a0a78075c216193b7364a0497075'
if quicktest else
'232affd2a50ec4bd69d2482aa0291385cbdefaba')
'72241cd356ce6dad7d0570d206ce869169151850')

def test_function_checksum(self):
db = self.db
Expand Down Expand Up @@ -335,6 +335,12 @@ def test_decomposition(self):
# New in 15.0.0
self.assertEqual(self.db.decomposition('\U0001e06d'), '' if self.old else '<super> 04B1')

# Hangul characters
self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
self.assertEqual(self.db.decomposition('\uD4DB'), '1111 1171 11B6')
self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')

self.assertRaises(TypeError, self.db.decomposition)
self.assertRaises(TypeError, self.db.decomposition, 'xx')

Expand Down Expand Up @@ -628,9 +634,9 @@ def test_east_asian_width_unassigned(self):
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
old = True
expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4'
expectedchecksum = ('883824cb6c0ccf994e4451ebf281e2d6d479af47'
if quicktest else
'b678d38ffbf1f1de092b2af1ed155602909fcd8d')
'44bbc0dfbfd746ba08180183482aa569a3830510')


class UnicodeMiscTest(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix :func:`unicodedata.decomposition` for Hangul characters.
44 changes: 33 additions & 11 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,17 @@
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

// Constants for algorithmic Hangul syllable decomposition
// (Unicode Standard, section 3.12.2, "Hangul Syllable Decomposition").
// A code point in [SBase, SBase + SCount) is split arithmetically into an
// L part, a V part and an optional T part; no table lookup is involved.

// First code point of the precomposed-syllable range.
#define SBase 0xAC00
// Base code point to which the L index (SIndex / NCount) is added.
#define LBase 0x1100
// Base code point to which the V index ((SIndex % NCount) / TCount) is added.
#define VBase 0x1161
// Base code point to which the T index (SIndex % TCount) is added.
// A result equal to TBase itself (index 0) means "no T part" and is omitted.
#define TBase 0x11A7
// Number of distinct L, V and T index values (T includes the "none" slot).
#define LCount 19
#define VCount 21
#define TCount 28
// Number of syllables sharing one L part.
#define NCount (VCount*TCount)
// Total number of precomposed syllables covered by the range.
#define SCount (LCount*NCount)

/*[clinic input]
unicodedata.UCD.decomposition

Expand Down Expand Up @@ -416,6 +427,25 @@
return PyUnicode_FromString(""); /* unassigned */
}

// Hangul Decomposition.
// See section 3.12.2, "Hangul Syllable Decomposition"
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
if (SBase <= code && code < (SBase + SCount)) {
int SIndex = code - SBase;
int L = LBase + SIndex / NCount;
int V = VBase + (SIndex % NCount) / TCount;
int T = TBase + SIndex % TCount;
if (T != TBase) {
PyOS_snprintf(decomp, sizeof(decomp),
"%04X %04X %04X", L, V, T);
}
else {
PyOS_snprintf(decomp, sizeof(decomp),
"%04X %04X", L, V);
}
return PyUnicode_FromString(decomp);
}

if (code < 0 || code >= 0x110000)
index = 0;
else {
Expand Down Expand Up @@ -478,16 +508,6 @@
(*index)++;
}

#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
Expand Down Expand Up @@ -541,7 +561,9 @@
}
output = new_output;
}
/* Hangul Decomposition. */
// Hangul Decomposition.
// See section 3.12.2, "Hangul Syllable Decomposition"
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
if (SBase <= code && code < (SBase+SCount)) {
int SIndex = code - SBase;
int L = LBase + SIndex / NCount;
Expand Down Expand Up @@ -1451,7 +1473,7 @@
}

if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);

Check warning on line 1476 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / build (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1476 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1476 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1476 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / build (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]
if (find_prefix_id(v) != i) {
return 0;
}
Expand Down
Loading