diff --git a/core/src/main/java/google/registry/idn/Latin-IDN.txt b/core/src/main/java/google/registry/idn/Latin-IDN.txt index 70c0b1de9..5b240ba3c 100644 --- a/core/src/main/java/google/registry/idn/Latin-IDN.txt +++ b/core/src/main/java/google/registry/idn/Latin-IDN.txt @@ -68,6 +68,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE U+01E7 # LATIN SMALL LETTER G WITH CARON U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE U+0123 # LATIN SMALL LETTER G WITH CEDILLA +U+01E5 # LATIN SMALL LETTER G WITH STROKE U+0068 # LATIN SMALL LETTER H U+0127 # LATIN SMALL LETTER H WITH STROKE U+0069 # LATIN SMALL LETTER I diff --git a/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java b/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java index f27208ade..cba8b1865 100644 --- a/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java +++ b/core/src/main/java/google/registry/tldconfig/idn/IdnTableEnum.java @@ -24,23 +24,48 @@ import java.net.URL; /** Wrapper enum that loads all {@link IdnTable} resources into memory. */ public enum IdnTableEnum { - EXTENDED_LATIN, - JA; + + /** + * Extended Latin, as used on our existing TLD launches prior to 2023. + * + *
As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so + * it is retained solely for legacy compatibility with already-launched TLDs. + */ + EXTENDED_LATIN("extended_latin.txt"), + + /** + * Extended Latin, but with confusable characters removed. + * + *
This is compatible with ICANN's requirements as of 2023, and is used for the Dads and Grads + * TLDs and all subsequent TLD launches. Note that confusable characters consist of various + * letters with diacritic marks on them, e.g. U+00EF (LATIN SMALL LETTER I WITH DIAERESIS) is not + * allowed because it is confusable with the standard i. + */ + UNCONFUSABLE_LATIN("unconfusable_latin.txt"), + + /** + * Japanese, as used on our existing TLD launches prior to 2023. + * + *
As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so
+ * it is retained solely for legacy compatibility with already-launched TLDs.
+ */
+ JA("japanese.txt");
private final IdnTable table;
- IdnTableEnum() {
- this.table = load(Ascii.toLowerCase(name()));
+ IdnTableEnum(String filename) {
+ this.table = load(Ascii.toLowerCase(name()), filename);
}
public IdnTable getTable() {
return table;
}
- private static IdnTable load(String name) {
+ private static IdnTable load(String tableName, String filename) {
try {
- URL resource = Resources.getResource(IdnTableEnum.class, name + ".txt");
- return IdnTable.createFrom(name, readLines(resource, UTF_8), LanguageValidator.get(name));
+ URL resource = Resources.getResource(IdnTableEnum.class, filename);
+ return IdnTable.createFrom(
+ tableName, readLines(resource, UTF_8), LanguageValidator.get(tableName));
} catch (IOException e) {
throw new RuntimeException(e); // should never happen
}
diff --git a/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt b/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt
index 2bd2d765f..ea035c6d1 100644
--- a/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt
+++ b/core/src/main/java/google/registry/tldconfig/idn/extended_latin.txt
@@ -49,6 +49,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE
U+01E7 # LATIN SMALL LETTER G WITH CARON
U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
U+0123 # LATIN SMALL LETTER G WITH CEDILLA
+U+01E5 # LATIN SMALL LETTER G WITH STROKE
U+0068 # LATIN SMALL LETTER H
U+0127 # LATIN SMALL LETTER H WITH STROKE
U+0069 # LATIN SMALL LETTER I
diff --git a/core/src/main/java/google/registry/tldconfig/idn/ja.txt b/core/src/main/java/google/registry/tldconfig/idn/japanese.txt
similarity index 100%
rename from core/src/main/java/google/registry/tldconfig/idn/ja.txt
rename to core/src/main/java/google/registry/tldconfig/idn/japanese.txt
diff --git a/core/src/main/java/google/registry/tldconfig/idn/unconfusable_latin.txt b/core/src/main/java/google/registry/tldconfig/idn/unconfusable_latin.txt
new file mode 100644
index 000000000..4f3443405
--- /dev/null
+++ b/core/src/main/java/google/registry/tldconfig/idn/unconfusable_latin.txt
@@ -0,0 +1,124 @@
+# Registry: Charleston Road Registry Inc.
+# Script: Latn
+# Version: 2.0
+# Effective Date: 2023-04-04
+# URL: https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt
+# Policy: https://www.registry.google/about/policies/domainabuse/
+# Contact Name: CRR Tech
+# Email address: crr-tech@google.com
+# Telephone: +1 (650) 253-0000
+#
+# Code points requiring context rules
+#
+# Code point Description of rule/Reference
+#
+# U+002D Label must neither start nor end with U+002D. Label
+# HYPHEN-MINUS must not have U+002D in both third and fourth
+# position. RFC 5891 (sec 4.2.3.1)
+#
+
+U+002D # HYPHEN-MINUS
+U+0030 # DIGIT ZERO
+U+0031 # DIGIT ONE
+U+0032 # DIGIT TWO
+U+0033 # DIGIT THREE
+U+0034 # DIGIT FOUR
+U+0035 # DIGIT FIVE
+U+0036 # DIGIT SIX
+U+0037 # DIGIT SEVEN
+U+0038 # DIGIT EIGHT
+U+0039 # DIGIT NINE
+U+0061 # LATIN SMALL LETTER A
+U+00E0 # LATIN SMALL LETTER A WITH GRAVE
+U+0103 # LATIN SMALL LETTER A WITH BREVE
+U+00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX
+U+00E5 # LATIN SMALL LETTER A WITH RING ABOVE
+U+00E4 # LATIN SMALL LETTER A WITH DIAERESIS
+U+00E3 # LATIN SMALL LETTER A WITH TILDE
+U+0105 # LATIN SMALL LETTER A WITH OGONEK
+U+00E6 # LATIN SMALL LETTER AE
+U+0062 # LATIN SMALL LETTER B
+U+0063 # LATIN SMALL LETTER C
+U+0107 # LATIN SMALL LETTER C WITH ACUTE
+U+010D # LATIN SMALL LETTER C WITH CARON
+U+00E7 # LATIN SMALL LETTER C WITH CEDILLA
+U+0064 # LATIN SMALL LETTER D
+U+010F # LATIN SMALL LETTER D WITH CARON
+U+0111 # LATIN SMALL LETTER D WITH STROKE
+U+00F0 # LATIN SMALL LETTER ETH
+U+0065 # LATIN SMALL LETTER E
+U+00E9 # LATIN SMALL LETTER E WITH ACUTE
+U+00E8 # LATIN SMALL LETTER E WITH GRAVE
+U+00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX
+U+011B # LATIN SMALL LETTER E WITH CARON
+U+00EB # LATIN SMALL LETTER E WITH DIAERESIS
+U+0119 # LATIN SMALL LETTER E WITH OGONEK
+U+0113 # LATIN SMALL LETTER E WITH MACRON
+U+0117 # LATIN SMALL LETTER E WITH DOT ABOVE
+U+0259 # LATIN SMALL LETTER SCHWA
+U+0066 # LATIN SMALL LETTER F
+U+0067 # LATIN SMALL LETTER G
+U+011F # LATIN SMALL LETTER G WITH BREVE
+U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
+U+0068 # LATIN SMALL LETTER H
+U+0127 # LATIN SMALL LETTER H WITH STROKE
+U+0069 # LATIN SMALL LETTER I
+U+00EC # LATIN SMALL LETTER I WITH GRAVE
+U+00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX
+U+012F # LATIN SMALL LETTER I WITH OGONEK
+U+012B # LATIN SMALL LETTER I WITH MACRON
+U+006A # LATIN SMALL LETTER J
+U+006B # LATIN SMALL LETTER K
+U+01E9 # LATIN SMALL LETTER K WITH CARON
+U+0137 # LATIN SMALL LETTER K WITH CEDILLA
+U+006C # LATIN SMALL LETTER L
+U+013A # LATIN SMALL LETTER L WITH ACUTE
+U+013E # LATIN SMALL LETTER L WITH CARON
+U+013C # LATIN SMALL LETTER L WITH CEDILLA
+U+0142 # LATIN SMALL LETTER L WITH STROKE
+U+006D # LATIN SMALL LETTER M
+U+006E # LATIN SMALL LETTER N
+U+0148 # LATIN SMALL LETTER N WITH CARON
+U+00F1 # LATIN SMALL LETTER N WITH TILDE
+U+0146 # LATIN SMALL LETTER N WITH CEDILLA
+U+006F # LATIN SMALL LETTER O
+U+00F2 # LATIN SMALL LETTER O WITH GRAVE
+U+00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX
+U+00F6 # LATIN SMALL LETTER O WITH DIAERESIS
+U+0151 # LATIN SMALL LETTER O WITH DOUBLE ACUTE
+U+00F5 # LATIN SMALL LETTER O WITH TILDE
+U+00F8 # LATIN SMALL LETTER O WITH STROKE
+U+0153 # LATIN SMALL LIGATURE OE
+U+0070 # LATIN SMALL LETTER P
+U+0071 # LATIN SMALL LETTER Q
+U+0072 # LATIN SMALL LETTER R
+U+0155 # LATIN SMALL LETTER R WITH ACUTE
+U+0159 # LATIN SMALL LETTER R WITH CARON
+U+0073 # LATIN SMALL LETTER S
+U+015B # LATIN SMALL LETTER S WITH ACUTE
+U+0161 # LATIN SMALL LETTER S WITH CARON
+U+015F # LATIN SMALL LETTER S WITH CEDILLA
+U+0074 # LATIN SMALL LETTER T
+U+0165 # LATIN SMALL LETTER T WITH CARON
+U+0167 # LATIN SMALL LETTER T WITH STROKE
+U+0075 # LATIN SMALL LETTER U
+U+00F9 # LATIN SMALL LETTER U WITH GRAVE
+U+00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX
+U+016F # LATIN SMALL LETTER U WITH RING ABOVE
+U+0171 # LATIN SMALL LETTER U WITH DOUBLE ACUTE
+U+0173 # LATIN SMALL LETTER U WITH OGONEK
+U+016B # LATIN SMALL LETTER U WITH MACRON
+U+0076 # LATIN SMALL LETTER V
+U+0077 # LATIN SMALL LETTER W
+U+0175 # LATIN SMALL LETTER W WITH CIRCUMFLEX
+U+0078 # LATIN SMALL LETTER X
+U+0079 # LATIN SMALL LETTER Y
+U+00FD # LATIN SMALL LETTER Y WITH ACUTE
+U+0177 # LATIN SMALL LETTER Y WITH CIRCUMFLEX
+U+00FF # LATIN SMALL LETTER Y WITH DIAERESIS
+U+007A # LATIN SMALL LETTER Z
+U+017A # LATIN SMALL LETTER Z WITH ACUTE
+U+017E # LATIN SMALL LETTER Z WITH CARON
+U+0292 # LATIN SMALL LETTER EZH
+U+01EF # LATIN SMALL LETTER EZH WITH CARON
+U+00FE # LATIN SMALL LETTER THORN
diff --git a/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml b/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml
index df14ad1e3..ffb8b40f1 100644
--- a/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml
+++ b/core/src/test/resources/google/registry/beam/rde/reducer_rde.xml
@@ -21,6 +21,11 @@