From 59681640fa2f4acad925d8476b35778c4603822e Mon Sep 17 00:00:00 2001 From: Ben McIlwain Date: Thu, 6 Apr 2023 15:49:36 -0400 Subject: [PATCH] Add a new Unconfusable Latin table (#1981) This new table has just been approved by ICANN. It is the same as our existing Extended Latin table, except with the removal of some lesser-used characters with diacritic marks that are confusable variants. The filenames for the IDN tables are made explicit to improve code readability. And this reverses the removal of G with stroke from the existing Extended Latin table (see PR #1938), so that that table continues to accurately reflect the state of our previously launched TLDs. This is the full list of removed characters: U+00E1 # LATIN SMALL LETTER A WITH ACUTE U+0101 # LATIN SMALL LETTER A WITH MACRON U+01CE # LATIN SMALL LETTER A WITH CARON U+010B # LATIN SMALL LETTER C WITH DOT ABOVE U+01E7 # LATIN SMALL LETTER G WITH CARON U+0123 # LATIN SMALL LETTER G WITH CEDILLA U+01E5 # LATIN SMALL LETTER G WITH STROKE U+0131 # LATIN SMALL LETTER DOTLESS I U+00ED # LATIN SMALL LETTER I WITH ACUTE U+00EF # LATIN SMALL LETTER I WITH DIAERESIS U+01D0 # LATIN SMALL LETTER I WITH CARON U+0144 # LATIN SMALL LETTER N WITH ACUTE U+014B # LATIN SMALL LETTER ENG U+00F3 # LATIN SMALL LETTER O WITH ACUTE U+014D # LATIN SMALL LETTER O WITH MACRON U+01D2 # LATIN SMALL LETTER O WITH CARON U+0157 # LATIN SMALL LETTER R WITH CEDILLA U+0163 # LATIN SMALL LETTER T WITH CEDILLA U+00FA # LATIN SMALL LETTER U WITH ACUTE U+00FC # LATIN SMALL LETTER U WITH DIAERESIS U+01D4 # LATIN SMALL LETTER U WITH CARON U+1E83 # LATIN SMALL LETTER W WITH ACUTE U+1E81 # LATIN SMALL LETTER W WITH GRAVE U+1E85 # LATIN SMALL LETTER W WITH DIAERESIS U+1EF3 # LATIN SMALL LETTER Y WITH GRAVE U+017C # LATIN SMALL LETTER Z WITH DOT ABOVE --- .../java/google/registry/idn/Latin-IDN.txt | 1 + .../registry/tldconfig/idn/IdnTableEnum.java | 39 +++++- .../registry/tldconfig/idn/extended_latin.txt | 1 + .../tldconfig/idn/{ja.txt => 
japanese.txt} | 0 .../tldconfig/idn/unconfusable_latin.txt | 124 ++++++++++++++++++ .../google/registry/beam/rde/reducer_rde.xml | 7 +- .../registry/beam/rde/reducer_rde_report.xml | 2 +- ...pReduce_withDomain_producesExpectedXml.xml | 7 +- ...MapReduce_withDomain_producesReportXml.xml | 2 +- .../xn--q9jyb4c_2010-10-17_full_S1_R0.xml | 7 +- 10 files changed, 178 insertions(+), 12 deletions(-) rename core/src/main/java/google/registry/tldconfig/idn/{ja.txt => japanese.txt} (100%) create mode 100644 core/src/main/java/google/registry/tldconfig/idn/unconfusable_latin.txt diff --git a/core/src/main/java/google/registry/idn/Latin-IDN.txt b/core/src/main/java/google/registry/idn/Latin-IDN.txt index 70c0b1de9..5b240ba3c 100644 --- a/core/src/main/java/google/registry/idn/Latin-IDN.txt +++ b/core/src/main/java/google/registry/idn/Latin-IDN.txt @@ -68,6 +68,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE U+01E7 # LATIN SMALL LETTER G WITH CARON U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE U+0123 # LATIN SMALL LETTER G WITH CEDILLA +U+01E5 # LATIN SMALL LETTER G WITH STROKE U+0068 # LATIN SMALL LETTER H U+0127 # LATIN SMALL LETTER H WITH STROKE U+0069 # LATIN SMALL LETTER I diff --git a/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java b/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java index f27208ade..cba8b1865 100644 --- a/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java +++ b/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java @@ -24,23 +24,48 @@ import java.net.URL; /** Wrapper enum that loads all {@link IdnTable} resources into memory. */ public enum IdnTableEnum { - EXTENDED_LATIN, - JA; + + /** + * Extended Latin, as used on our existing TLD launches prior to 2023. + * + *

<p>As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so + * it is retained solely for legacy compatibility with already-launched TLDs. + */ + EXTENDED_LATIN("extended_latin.txt"), + + /** + * Extended Latin, but with confusable characters removed. + * + *

<p>This is compatible with ICANN's requirements as of 2023, and is used for the Dads and Grads + * TLDs and all subsequent TLD launches. Note that confusable characters consist of various + * letters with diacritic marks on them, e.g. U+00EF (LATIN SMALL LETTER I WITH DIAERESIS) is not + * allowed because it is confusable with the standard i. + */ + UNCONFUSABLE_LATIN("unconfusable_latin.txt"), + + /** + * Japanese, as used on our existing TLD launches prior to 2023. + * + *

<p>As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so + * it is retained solely for legacy compatibility with already-launched TLDs. + */ + JA("japanese.txt"); private final IdnTable table; - IdnTableEnum() { - this.table = load(Ascii.toLowerCase(name())); + IdnTableEnum(String filename) { + this.table = load(Ascii.toLowerCase(name()), filename); } public IdnTable getTable() { return table; } - private static IdnTable load(String name) { + private static IdnTable load(String tableName, String filename) { try { - URL resource = Resources.getResource(IdnTableEnum.class, name + ".txt"); - return IdnTable.createFrom(name, readLines(resource, UTF_8), LanguageValidator.get(name)); + URL resource = Resources.getResource(IdnTableEnum.class, filename); + return IdnTable.createFrom( + tableName, readLines(resource, UTF_8), LanguageValidator.get(tableName)); } catch (IOException e) { throw new RuntimeException(e); // should never happen } diff --git a/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt b/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt index 2bd2d765f..ea035c6d1 100644 --- a/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt +++ b/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt @@ -49,6 +49,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE U+01E7 # LATIN SMALL LETTER G WITH CARON U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE U+0123 # LATIN SMALL LETTER G WITH CEDILLA +U+01E5 # LATIN SMALL LETTER G WITH STROKE U+0068 # LATIN SMALL LETTER H U+0127 # LATIN SMALL LETTER H WITH STROKE U+0069 # LATIN SMALL LETTER I diff --git a/core/src/main/java/google/registry/tldconfig/idn/ja.txt b/core/src/main/java/google/registry/tldconfig/idn/japanese.txt similarity index 100% rename from core/src/main/java/google/registry/tldconfig/idn/ja.txt rename to core/src/main/java/google/registry/tldconfig/idn/japanese.txt diff --git
a/core/src/main/java/google/registry/tldconfig/idn/unconfusable_latin.txt b/core/src/main/java/google/registry/tldconfig/idn/unconfusable_latin.txt new file mode 100644 index 000000000..4f3443405 --- /dev/null +++ b/core/src/main/java/google/registry/tldconfig/idn/unconfusable_latin.txt @@ -0,0 +1,124 @@ +# Registry: Charleston Road Registry Inc. +# Script: Latn +# Version: 2.0 +# Effective Date: 2023-04-04 +# URL: https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt +# Policy: https://www.registry.google/about/policies/domainabuse/ +# Contact Name: CRR Tech +# Email address: crr-tech@google.com +# Telephone: +1 (650) 253-0000 +# +# Code points requiring context rules +# +# Code point Description of rule/Reference +# +# U+002D Label must neither start nor end with U+002D. Label +# HYPHEN-MINUS must not have U+002D in both third and fourth +# position. RFC 5891 (sec 4.2.3.1) +# + +U+002D # HYPHEN-MINUS +U+0030 # DIGIT ZERO +U+0031 # DIGIT ONE +U+0032 # DIGIT TWO +U+0033 # DIGIT THREE +U+0034 # DIGIT FOUR +U+0035 # DIGIT FIVE +U+0036 # DIGIT SIX +U+0037 # DIGIT SEVEN +U+0038 # DIGIT EIGHT +U+0039 # DIGIT NINE +U+0061 # LATIN SMALL LETTER A +U+00E0 # LATIN SMALL LETTER A WITH GRAVE +U+0103 # LATIN SMALL LETTER A WITH BREVE +U+00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +U+00E5 # LATIN SMALL LETTER A WITH RING ABOVE +U+00E4 # LATIN SMALL LETTER A WITH DIAERESIS +U+00E3 # LATIN SMALL LETTER A WITH TILDE +U+0105 # LATIN SMALL LETTER A WITH OGONEK +U+00E6 # LATIN SMALL LETTER AE +U+0062 # LATIN SMALL LETTER B +U+0063 # LATIN SMALL LETTER C +U+0107 # LATIN SMALL LETTER C WITH ACUTE +U+010D # LATIN SMALL LETTER C WITH CARON +U+00E7 # LATIN SMALL LETTER C WITH CEDILLA +U+0064 # LATIN SMALL LETTER D +U+010F # LATIN SMALL LETTER D WITH CARON +U+0111 # LATIN SMALL LETTER D WITH STROKE +U+00F0 # LATIN SMALL LETTER ETH +U+0065 # LATIN SMALL LETTER E +U+00E9 # LATIN SMALL LETTER E WITH ACUTE +U+00E8 # LATIN SMALL LETTER E WITH GRAVE +U+00EA # LATIN SMALL LETTER E 
WITH CIRCUMFLEX +U+011B # LATIN SMALL LETTER E WITH CARON +U+00EB # LATIN SMALL LETTER E WITH DIAERESIS +U+0119 # LATIN SMALL LETTER E WITH OGONEK +U+0113 # LATIN SMALL LETTER E WITH MACRON +U+0117 # LATIN SMALL LETTER E WITH DOT ABOVE +U+0259 # LATIN SMALL LETTER SCHWA +U+0066 # LATIN SMALL LETTER F +U+0067 # LATIN SMALL LETTER G +U+011F # LATIN SMALL LETTER G WITH BREVE +U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE +U+0068 # LATIN SMALL LETTER H +U+0127 # LATIN SMALL LETTER H WITH STROKE +U+0069 # LATIN SMALL LETTER I +U+00EC # LATIN SMALL LETTER I WITH GRAVE +U+00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +U+012F # LATIN SMALL LETTER I WITH OGONEK +U+012B # LATIN SMALL LETTER I WITH MACRON +U+006A # LATIN SMALL LETTER J +U+006B # LATIN SMALL LETTER K +U+01E9 # LATIN SMALL LETTER K WITH CARON +U+0137 # LATIN SMALL LETTER K WITH CEDILLA +U+006C # LATIN SMALL LETTER L +U+013A # LATIN SMALL LETTER L WITH ACUTE +U+013E # LATIN SMALL LETTER L WITH CARON +U+013C # LATIN SMALL LETTER L WITH CEDILLA +U+0142 # LATIN SMALL LETTER L WITH STROKE +U+006D # LATIN SMALL LETTER M +U+006E # LATIN SMALL LETTER N +U+0148 # LATIN SMALL LETTER N WITH CARON +U+00F1 # LATIN SMALL LETTER N WITH TILDE +U+0146 # LATIN SMALL LETTER N WITH CEDILLA +U+006F # LATIN SMALL LETTER O +U+00F2 # LATIN SMALL LETTER O WITH GRAVE +U+00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +U+00F6 # LATIN SMALL LETTER O WITH DIAERESIS +U+0151 # LATIN SMALL LETTER O WITH DOUBLE ACUTE +U+00F5 # LATIN SMALL LETTER O WITH TILDE +U+00F8 # LATIN SMALL LETTER O WITH STROKE +U+0153 # LATIN SMALL LIGATURE OE +U+0070 # LATIN SMALL LETTER P +U+0071 # LATIN SMALL LETTER Q +U+0072 # LATIN SMALL LETTER R +U+0155 # LATIN SMALL LETTER R WITH ACUTE +U+0159 # LATIN SMALL LETTER R WITH CARON +U+0073 # LATIN SMALL LETTER S +U+015B # LATIN SMALL LETTER S WITH ACUTE +U+0161 # LATIN SMALL LETTER S WITH CARON +U+015F # LATIN SMALL LETTER S WITH CEDILLA +U+0074 # LATIN SMALL LETTER T +U+0165 # LATIN SMALL LETTER T WITH CARON +U+0167 # LATIN 
SMALL LETTER T WITH STROKE +U+0075 # LATIN SMALL LETTER U +U+00F9 # LATIN SMALL LETTER U WITH GRAVE +U+00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +U+016F # LATIN SMALL LETTER U WITH RING ABOVE +U+0171 # LATIN SMALL LETTER U WITH DOUBLE ACUTE +U+0173 # LATIN SMALL LETTER U WITH OGONEK +U+016B # LATIN SMALL LETTER U WITH MACRON +U+0076 # LATIN SMALL LETTER V +U+0077 # LATIN SMALL LETTER W +U+0175 # LATIN SMALL LETTER W WITH CIRCUMFLEX +U+0078 # LATIN SMALL LETTER X +U+0079 # LATIN SMALL LETTER Y +U+00FD # LATIN SMALL LETTER Y WITH ACUTE +U+0177 # LATIN SMALL LETTER Y WITH CIRCUMFLEX +U+00FF # LATIN SMALL LETTER Y WITH DIAERESIS +U+007A # LATIN SMALL LETTER Z +U+017A # LATIN SMALL LETTER Z WITH ACUTE +U+017E # LATIN SMALL LETTER Z WITH CARON +U+0292 # LATIN SMALL LETTER EZH +U+01EF # LATIN SMALL LETTER EZH WITH CARON +U+00FE # LATIN SMALL LETTER THORN diff --git a/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml b/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml index df14ad1e3..ffb8b40f1 100644 --- a/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml +++ b/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml @@ -21,6 +21,11 @@ https://www.registry.google/about/policies/domainabuse/ + + https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt + https://www.registry.google/about/policies/domainabuse/ + + https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt https://www.registry.google/about/policies/domainabuse/ @@ -32,7 +37,7 @@ 1 1 1 - 2 + 3 diff --git a/core/src/test/resources/google/registry/beam/rde/reducer_rde_report.xml b/core/src/test/resources/google/registry/beam/rde/reducer_rde_report.xml index 851c3b765..fe9b190b8 100644 --- a/core/src/test/resources/google/registry/beam/rde/reducer_rde_report.xml +++ b/core/src/test/resources/google/registry/beam/rde/reducer_rde_report.xml @@ -14,6 +14,6 @@ 1 1 1 - 2 + 3 diff --git 
a/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesExpectedXml.xml b/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesExpectedXml.xml index ddaf015ca..402859c2f 100644 --- a/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesExpectedXml.xml +++ b/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesExpectedXml.xml @@ -245,6 +245,11 @@ https://www.registry.google/about/policies/domainabuse/ + + https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt + https://www.registry.google/about/policies/domainabuse/ + + https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt https://www.registry.google/about/policies/domainabuse/ @@ -256,7 +261,7 @@ 1 2 2 - 2 + 3 diff --git a/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesReportXml.xml b/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesReportXml.xml index fe51911d2..7032ee59b 100644 --- a/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesReportXml.xml +++ b/core/src/test/resources/google/registry/rde/testMapReduce_withDomain_producesReportXml.xml @@ -37,6 +37,6 @@ 1 2 2 - 2 + 3 diff --git a/core/src/test/resources/google/registry/tools/server/xn--q9jyb4c_2010-10-17_full_S1_R0.xml b/core/src/test/resources/google/registry/tools/server/xn--q9jyb4c_2010-10-17_full_S1_R0.xml index 8752de1af..0df8bb82f 100644 --- a/core/src/test/resources/google/registry/tools/server/xn--q9jyb4c_2010-10-17_full_S1_R0.xml +++ b/core/src/test/resources/google/registry/tools/server/xn--q9jyb4c_2010-10-17_full_S1_R0.xml @@ -119,6 +119,11 @@ https://www.registry.google/about/policies/domainabuse/ + + https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt + https://www.registry.google/about/policies/domainabuse/ + + https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt https://www.registry.google/about/policies/domainabuse/ 
@@ -130,7 +135,7 @@ 0 1 2 - 2 + 3