Add a new Unconfusable Latin table (#1981)

This new table has just been approved by ICANN. It is the same as our existing
Extended Latin table, except with the removal of some lesser-used characters
with diacritic marks that are confusable variants.

The filenames for the IDN tables are made explicit to improve code readability.

And this reverses the removal of G with stroke from the existing Extended Latin
table (see PR #1938), so that that table continues to accurately reflect the
state of our previously launched TLDs.

This is the full list of removed characters:

U+00E1                         # LATIN SMALL LETTER A WITH ACUTE
U+0101                         # LATIN SMALL LETTER A WITH MACRON
U+01CE                         # LATIN SMALL LETTER A WITH CARON
U+010B                         # LATIN SMALL LETTER C WITH DOT ABOVE
U+01E7                         # LATIN SMALL LETTER G WITH CARON
U+0123                         # LATIN SMALL LETTER G WITH CEDILLA
U+01E5                         # LATIN SMALL LETTER G WITH STROKE
U+0131                         # LATIN SMALL LETTER DOTLESS I
U+00ED                         # LATIN SMALL LETTER I WITH ACUTE
U+00EF                         # LATIN SMALL LETTER I WITH DIAERESIS
U+01D0                         # LATIN SMALL LETTER I WITH CARON
U+0144                         # LATIN SMALL LETTER N WITH ACUTE
U+014B                         # LATIN SMALL LETTER ENG
U+00F3                         # LATIN SMALL LETTER O WITH ACUTE
U+014D                         # LATIN SMALL LETTER O WITH MACRON
U+01D2                         # LATIN SMALL LETTER O WITH CARON
U+0157                         # LATIN SMALL LETTER R WITH CEDILLA
U+0163                         # LATIN SMALL LETTER T WITH CEDILLA
U+00FA                         # LATIN SMALL LETTER U WITH ACUTE
U+00FC                         # LATIN SMALL LETTER U WITH DIAERESIS
U+01D4                         # LATIN SMALL LETTER U WITH CARON
U+1E83                         # LATIN SMALL LETTER W WITH ACUTE
U+1E81                         # LATIN SMALL LETTER W WITH GRAVE
U+1E85                         # LATIN SMALL LETTER W WITH DIAERESIS
U+1EF3                         # LATIN SMALL LETTER Y WITH GRAVE
U+017C                         # LATIN SMALL LETTER Z WITH DOT ABOVE
This commit is contained in:
Ben McIlwain 2023-04-06 15:49:36 -04:00 committed by GitHub
parent 9f65624f3e
commit 59681640fa
10 changed files with 178 additions and 12 deletions

View file

@ -68,6 +68,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE
U+01E7 # LATIN SMALL LETTER G WITH CARON U+01E7 # LATIN SMALL LETTER G WITH CARON
U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
U+0123 # LATIN SMALL LETTER G WITH CEDILLA U+0123 # LATIN SMALL LETTER G WITH CEDILLA
U+01E5 # LATIN SMALL LETTER G WITH STROKE
U+0068 # LATIN SMALL LETTER H U+0068 # LATIN SMALL LETTER H
U+0127 # LATIN SMALL LETTER H WITH STROKE U+0127 # LATIN SMALL LETTER H WITH STROKE
U+0069 # LATIN SMALL LETTER I U+0069 # LATIN SMALL LETTER I

View file

@ -24,23 +24,48 @@ import java.net.URL;
/** Wrapper enum that loads all {@link IdnTable} resources into memory. */ /** Wrapper enum that loads all {@link IdnTable} resources into memory. */
public enum IdnTableEnum { public enum IdnTableEnum {
EXTENDED_LATIN,
JA; /**
* Extended Latin, as used on our existing TLD launches prior to 2023.
*
* <p>As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so
* it is retained solely for legacy compatibility with already-launched TLDs.
*/
EXTENDED_LATIN("extended_latin.txt"),
/**
* Extended Latin, but with confusable characters removed.
*
* <p>This is compatible with ICANN's requirements as of 2023, and is used for the Dads and Grads
* TLDs and all subsequent TLD launches. Note that confusable characters consist of various
* letters with diacritic marks on them, e.g. U+00EF (LATIN SMALL LETTER I WITH DIAERESIS) is not
* allowed because it is confusable with the standard i.
*/
UNCONFUSABLE_LATIN("unconfusable_latin.txt"),
/**
* Japanese, as used on our existing TLD launches prior to 2023.
*
* <p>As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so
* it is retained solely for legacy compatibility with already-launched TLDs.
*/
JA("japanese.txt");
private final IdnTable table; private final IdnTable table;
IdnTableEnum() { IdnTableEnum(String filename) {
this.table = load(Ascii.toLowerCase(name())); this.table = load(Ascii.toLowerCase(name()), filename);
} }
public IdnTable getTable() { public IdnTable getTable() {
return table; return table;
} }
private static IdnTable load(String name) { private static IdnTable load(String tableName, String filename) {
try { try {
URL resource = Resources.getResource(IdnTableEnum.class, name + ".txt"); URL resource = Resources.getResource(IdnTableEnum.class, filename);
return IdnTable.createFrom(name, readLines(resource, UTF_8), LanguageValidator.get(name)); return IdnTable.createFrom(
tableName, readLines(resource, UTF_8), LanguageValidator.get(tableName));
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); // should never happen throw new RuntimeException(e); // should never happen
} }

View file

@ -49,6 +49,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE
U+01E7 # LATIN SMALL LETTER G WITH CARON U+01E7 # LATIN SMALL LETTER G WITH CARON
U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
U+0123 # LATIN SMALL LETTER G WITH CEDILLA U+0123 # LATIN SMALL LETTER G WITH CEDILLA
U+01E5 # LATIN SMALL LETTER G WITH STROKE
U+0068 # LATIN SMALL LETTER H U+0068 # LATIN SMALL LETTER H
U+0127 # LATIN SMALL LETTER H WITH STROKE U+0127 # LATIN SMALL LETTER H WITH STROKE
U+0069 # LATIN SMALL LETTER I U+0069 # LATIN SMALL LETTER I

View file

@ -0,0 +1,124 @@
# Registry: Charleston Road Registry Inc.
# Script: Latn
# Version: 2.0
# Effective Date: 2023-04-04
# URL: https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt
# Policy: https://www.registry.google/about/policies/domainabuse/
# Contact Name: CRR Tech
# Email address: crr-tech@google.com
# Telephone: +1 (650) 253-0000
#
# Code points requiring context rules
#
# Code point Description of rule/Reference
#
# U+002D Label must neither start nor end with U+002D. Label
# HYPHEN-MINUS must not have U+002D in both third and fourth
# position. RFC 5891 (sec 4.2.3.1)
#
U+002D # HYPHEN-MINUS
U+0030 # DIGIT ZERO
U+0031 # DIGIT ONE
U+0032 # DIGIT TWO
U+0033 # DIGIT THREE
U+0034 # DIGIT FOUR
U+0035 # DIGIT FIVE
U+0036 # DIGIT SIX
U+0037 # DIGIT SEVEN
U+0038 # DIGIT EIGHT
U+0039 # DIGIT NINE
U+0061 # LATIN SMALL LETTER A
U+00E0 # LATIN SMALL LETTER A WITH GRAVE
U+0103 # LATIN SMALL LETTER A WITH BREVE
U+00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX
U+00E5 # LATIN SMALL LETTER A WITH RING ABOVE
U+00E4 # LATIN SMALL LETTER A WITH DIAERESIS
U+00E3 # LATIN SMALL LETTER A WITH TILDE
U+0105 # LATIN SMALL LETTER A WITH OGONEK
U+00E6 # LATIN SMALL LETTER AE
U+0062 # LATIN SMALL LETTER B
U+0063 # LATIN SMALL LETTER C
U+0107 # LATIN SMALL LETTER C WITH ACUTE
U+010D # LATIN SMALL LETTER C WITH CARON
U+00E7 # LATIN SMALL LETTER C WITH CEDILLA
U+0064 # LATIN SMALL LETTER D
U+010F # LATIN SMALL LETTER D WITH CARON
U+0111 # LATIN SMALL LETTER D WITH STROKE
U+00F0 # LATIN SMALL LETTER ETH
U+0065 # LATIN SMALL LETTER E
U+00E9 # LATIN SMALL LETTER E WITH ACUTE
U+00E8 # LATIN SMALL LETTER E WITH GRAVE
U+00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX
U+011B # LATIN SMALL LETTER E WITH CARON
U+00EB # LATIN SMALL LETTER E WITH DIAERESIS
U+0119 # LATIN SMALL LETTER E WITH OGONEK
U+0113 # LATIN SMALL LETTER E WITH MACRON
U+0117 # LATIN SMALL LETTER E WITH DOT ABOVE
U+0259 # LATIN SMALL LETTER SCHWA
U+0066 # LATIN SMALL LETTER F
U+0067 # LATIN SMALL LETTER G
U+011F # LATIN SMALL LETTER G WITH BREVE
U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
U+0068 # LATIN SMALL LETTER H
U+0127 # LATIN SMALL LETTER H WITH STROKE
U+0069 # LATIN SMALL LETTER I
U+00EC # LATIN SMALL LETTER I WITH GRAVE
U+00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX
U+012F # LATIN SMALL LETTER I WITH OGONEK
U+012B # LATIN SMALL LETTER I WITH MACRON
U+006A # LATIN SMALL LETTER J
U+006B # LATIN SMALL LETTER K
U+01E9 # LATIN SMALL LETTER K WITH CARON
U+0137 # LATIN SMALL LETTER K WITH CEDILLA
U+006C # LATIN SMALL LETTER L
U+013A # LATIN SMALL LETTER L WITH ACUTE
U+013E # LATIN SMALL LETTER L WITH CARON
U+013C # LATIN SMALL LETTER L WITH CEDILLA
U+0142 # LATIN SMALL LETTER L WITH STROKE
U+006D # LATIN SMALL LETTER M
U+006E # LATIN SMALL LETTER N
U+0148 # LATIN SMALL LETTER N WITH CARON
U+00F1 # LATIN SMALL LETTER N WITH TILDE
U+0146 # LATIN SMALL LETTER N WITH CEDILLA
U+006F # LATIN SMALL LETTER O
U+00F2 # LATIN SMALL LETTER O WITH GRAVE
U+00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX
U+00F6 # LATIN SMALL LETTER O WITH DIAERESIS
U+0151 # LATIN SMALL LETTER O WITH DOUBLE ACUTE
U+00F5 # LATIN SMALL LETTER O WITH TILDE
U+00F8 # LATIN SMALL LETTER O WITH STROKE
U+0153 # LATIN SMALL LIGATURE OE
U+0070 # LATIN SMALL LETTER P
U+0071 # LATIN SMALL LETTER Q
U+0072 # LATIN SMALL LETTER R
U+0155 # LATIN SMALL LETTER R WITH ACUTE
U+0159 # LATIN SMALL LETTER R WITH CARON
U+0073 # LATIN SMALL LETTER S
U+015B # LATIN SMALL LETTER S WITH ACUTE
U+0161 # LATIN SMALL LETTER S WITH CARON
U+015F # LATIN SMALL LETTER S WITH CEDILLA
U+0074 # LATIN SMALL LETTER T
U+0165 # LATIN SMALL LETTER T WITH CARON
U+0167 # LATIN SMALL LETTER T WITH STROKE
U+0075 # LATIN SMALL LETTER U
U+00F9 # LATIN SMALL LETTER U WITH GRAVE
U+00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX
U+016F # LATIN SMALL LETTER U WITH RING ABOVE
U+0171 # LATIN SMALL LETTER U WITH DOUBLE ACUTE
U+0173 # LATIN SMALL LETTER U WITH OGONEK
U+016B # LATIN SMALL LETTER U WITH MACRON
U+0076 # LATIN SMALL LETTER V
U+0077 # LATIN SMALL LETTER W
U+0175 # LATIN SMALL LETTER W WITH CIRCUMFLEX
U+0078 # LATIN SMALL LETTER X
U+0079 # LATIN SMALL LETTER Y
U+00FD # LATIN SMALL LETTER Y WITH ACUTE
U+0177 # LATIN SMALL LETTER Y WITH CIRCUMFLEX
U+00FF # LATIN SMALL LETTER Y WITH DIAERESIS
U+007A # LATIN SMALL LETTER Z
U+017A # LATIN SMALL LETTER Z WITH ACUTE
U+017E # LATIN SMALL LETTER Z WITH CARON
U+0292 # LATIN SMALL LETTER EZH
U+01EF # LATIN SMALL LETTER EZH WITH CARON
U+00FE # LATIN SMALL LETTER THORN

View file

@ -21,6 +21,11 @@
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy> <rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
</rdeIDN:idnTableRef> </rdeIDN:idnTableRef>
<rdeIDN:idnTableRef id="unconfusable_latin">
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt</rdeIDN:url>
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
</rdeIDN:idnTableRef>
<rdeIDN:idnTableRef id="ja"> <rdeIDN:idnTableRef id="ja">
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url> <rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url>
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy> <rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
@ -32,7 +37,7 @@
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
</rdeHeader:header> </rdeHeader:header>
</rde:contents> </rde:contents>

View file

@ -14,6 +14,6 @@
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
</rdeHeader:header> </rdeHeader:header>
</rdeReport:report> </rdeReport:report>

View file

@ -245,6 +245,11 @@
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy> <rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
</rdeIDN:idnTableRef> </rdeIDN:idnTableRef>
<rdeIDN:idnTableRef id="unconfusable_latin">
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt</rdeIDN:url>
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
</rdeIDN:idnTableRef>
<rdeIDN:idnTableRef id="ja"> <rdeIDN:idnTableRef id="ja">
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url> <rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url>
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy> <rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
@ -256,7 +261,7 @@
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">2</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
</rdeHeader:header> </rdeHeader:header>
</rde:contents> </rde:contents>

View file

@ -37,6 +37,6 @@
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">2</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
</rdeHeader:header> </rdeHeader:header>
</rdeReport:report> </rdeReport:report>

View file

@ -119,6 +119,11 @@
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy> <rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
</rdeIDN:idnTableRef> </rdeIDN:idnTableRef>
<rdeIDN:idnTableRef id="unconfusable_latin">
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt</rdeIDN:url>
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
</rdeIDN:idnTableRef>
<rdeIDN:idnTableRef id="ja"> <rdeIDN:idnTableRef id="ja">
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url> <rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url>
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy> <rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
@ -130,7 +135,7 @@
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeContact-1.0">0</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeContact-1.0">0</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count>
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count> <rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
</rdeHeader:header> </rdeHeader:header>
</rde:contents> </rde:contents>