# Lib/test/test_ucn.py
""" Test script for the Unicode implementation. Written by Bill Tutt. Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. """#" import unittest import sys from test import test_support try: from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX except ImportError: INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1 class UnicodeNamesTest(unittest.TestCase): def checkletter(self, name, code): # Helper that put all \N escapes inside eval'd raw strings, # to make sure this script runs even if the compiler # chokes on \N escapes res = eval(ur'u"\N{%s}"' % name) self.assertEqual(res, code) return res def test_general(self): # General and case insensitivity test: chars = [ "LATIN CAPITAL LETTER T", "LATIN SMALL LETTER H", "LATIN SMALL LETTER E", "SPACE", "LATIN SMALL LETTER R", "LATIN CAPITAL LETTER E", "LATIN SMALL LETTER D", "SPACE", "LATIN SMALL LETTER f", "LATIN CAPITAL LeTtEr o", "LATIN SMaLl LETTER x", "SPACE", "LATIN SMALL LETTER A", "LATIN SMALL LETTER T", "LATIN SMALL LETTER E", "SPACE", "LATIN SMALL LETTER T", "LATIN SMALL LETTER H", "LATIN SMALL LETTER E", "SpAcE", "LATIN SMALL LETTER S", "LATIN SMALL LETTER H", "LATIN small LETTER e", "LATIN small LETTER e", "LATIN SMALL LETTER P", "FULL STOP" ] string = u"The rEd fOx ate the sheep." 
self.assertEqual( u"".join([self.checkletter(*args) for args in zip(chars, string)]), string ) def test_ascii_letters(self): import unicodedata for char in "".join(map(chr, xrange(ord("a"), ord("z")))): name = "LATIN SMALL LETTER %s" % char.upper() code = unicodedata.lookup(name) self.assertEqual(unicodedata.name(code), name) def test_hangul_syllables(self): self.checkletter("HANGUL SYLLABLE GA", u"\uac00") self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8") self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0") self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8") self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0") self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88") self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370") self.checkletter("HANGUL SYLLABLE YI", u"\uc758") self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40") self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28") self.checkletter("HANGUL SYLLABLE PAN", u"\ud310") self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8") self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3") import unicodedata self.assertRaises(ValueError, unicodedata.name, u"\ud7a4") def test_cjk_unified_ideographs(self): self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400") self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5") self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00") self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5") self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000") self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6") def test_bmp_characters(self): import unicodedata count = 0 for code in xrange(0x10000): char = unichr(code) name = unicodedata.name(char, None) if name is not None: self.assertEqual(unicodedata.lookup(name), char) count += 1 def test_misc_symbols(self): self.checkletter("PILCROW SIGN", u"\u00b6") self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD") self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F") self.checkletter("FULLWIDTH LATIN SMALL LETTER A", 
u"\uFF41") def test_errors(self): import unicodedata self.assertRaises(TypeError, unicodedata.name) self.assertRaises(TypeError, unicodedata.name, u'xx') self.assertRaises(TypeError, unicodedata.lookup) self.assertRaises(KeyError, unicodedata.lookup, u'unknown') def test_strict_eror_handling(self): # bogus character name self.assertRaises( UnicodeError, unicode, "\\N{blah}", 'unicode-escape', 'strict' ) # long bogus character name self.assertRaises( UnicodeError, unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict' ) # missing closing brace self.assertRaises( UnicodeError, unicode, "\\N{SPACE", 'unicode-escape', 'strict' ) # missing opening brace self.assertRaises( UnicodeError, unicode, "\\NSPACE", 'unicode-escape', 'strict' ) @test_support.cpython_only @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX") @unittest.skipUnless(UINT_MAX < sys.maxint, "needs UINT_MAX < sys.maxint") @test_support.bigmemtest(minsize=UINT_MAX + 1, memuse=2 + 4 // len(u'\U00010000')) def test_issue16335(self, size): func = self.test_issue16335 if size < func.minsize: raise unittest.SkipTest("not enough memory: %.1fG minimum needed" % (func.minsize * func.memuse / float(1024**3),)) # very very long bogus character name x = b'\\N{SPACE' + b'x' * int(UINT_MAX + 1) + b'}' self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1)) self.assertRaisesRegexp(UnicodeError, 'unknown Unicode character name', x.decode, 'unicode-escape' ) def test_main(): test_support.run_unittest(UnicodeNamesTest) if __name__ == "__main__": test_main() |