mirror of
https://github.com/google/nomulus.git
synced 2025-07-03 09:43:30 +02:00
Add a new Unconfusable Latin table (#1981)
This new table has just been approved by ICANN. It is the same as our existing Extended Latin table, except that some lesser-used characters with diacritic marks that are confusable variants have been removed.

The filenames for the IDN tables are now made explicit to improve code readability. This change also reverses the removal of G with stroke from the existing Extended Latin table (see PR #1938), so that the Extended Latin table continues to accurately reflect the state of our previously launched TLDs.

This is the full list of characters removed in the new table:
U+00E1 # LATIN SMALL LETTER A WITH ACUTE
U+0101 # LATIN SMALL LETTER A WITH MACRON
U+01CE # LATIN SMALL LETTER A WITH CARON
U+010B # LATIN SMALL LETTER C WITH DOT ABOVE
U+01E7 # LATIN SMALL LETTER G WITH CARON
U+0123 # LATIN SMALL LETTER G WITH CEDILLA
U+01E5 # LATIN SMALL LETTER G WITH STROKE
U+0131 # LATIN SMALL LETTER DOTLESS I
U+00ED # LATIN SMALL LETTER I WITH ACUTE
U+00EF # LATIN SMALL LETTER I WITH DIAERESIS
U+01D0 # LATIN SMALL LETTER I WITH CARON
U+0144 # LATIN SMALL LETTER N WITH ACUTE
U+014B # LATIN SMALL LETTER ENG
U+00F3 # LATIN SMALL LETTER O WITH ACUTE
U+014D # LATIN SMALL LETTER O WITH MACRON
U+01D2 # LATIN SMALL LETTER O WITH CARON
U+0157 # LATIN SMALL LETTER R WITH CEDILLA
U+0163 # LATIN SMALL LETTER T WITH CEDILLA
U+00FA # LATIN SMALL LETTER U WITH ACUTE
U+00FC # LATIN SMALL LETTER U WITH DIAERESIS
U+01D4 # LATIN SMALL LETTER U WITH CARON
U+1E83 # LATIN SMALL LETTER W WITH ACUTE
U+1E81 # LATIN SMALL LETTER W WITH GRAVE
U+1E85 # LATIN SMALL LETTER W WITH DIAERESIS
U+1EF3 # LATIN SMALL LETTER Y WITH GRAVE
U+017C # LATIN SMALL LETTER Z WITH DOT ABOVE
This commit is contained in:
parent
9f65624f3e
commit
59681640fa
10 changed files with 178 additions and 12 deletions
|
@ -68,6 +68,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE
|
|||
U+01E7 # LATIN SMALL LETTER G WITH CARON
|
||||
U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
|
||||
U+0123 # LATIN SMALL LETTER G WITH CEDILLA
|
||||
U+01E5 # LATIN SMALL LETTER G WITH STROKE
|
||||
U+0068 # LATIN SMALL LETTER H
|
||||
U+0127 # LATIN SMALL LETTER H WITH STROKE
|
||||
U+0069 # LATIN SMALL LETTER I
|
||||
|
|
|
@ -24,23 +24,48 @@ import java.net.URL;
|
|||
|
||||
/** Wrapper enum that loads all {@link IdnTable} resources into memory. */
|
||||
public enum IdnTableEnum {
|
||||
EXTENDED_LATIN,
|
||||
JA;
|
||||
|
||||
/**
|
||||
* Extended Latin, as used on our existing TLD launches prior to 2023.
|
||||
*
|
||||
* <p>As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so
|
||||
* it is retained solely for legacy compatibility with already-launched TLDs.
|
||||
*/
|
||||
EXTENDED_LATIN("extended_latin.txt"),
|
||||
|
||||
/**
|
||||
* Extended Latin, but with confusable characters removed.
|
||||
*
|
||||
* <p>This is compatible with ICANN's requirements as of 2023, and is used for the Dads and Grads
|
||||
* TLDs and all subsequent TLD launches. Note that confusable characters consist of various
|
||||
* letters with diacritic marks on them, e.g. U+00EF (LATIN SMALL LETTER I WITH DIAERESIS) is not
|
||||
* allowed because it is confusable with the standard i.
|
||||
*/
|
||||
UNCONFUSABLE_LATIN("unconfusable_latin.txt"),
|
||||
|
||||
/**
|
||||
* Japanese, as used on our existing TLD launches prior to 2023.
|
||||
*
|
||||
* <p>As of 2023 this table is no longer conformant with ICANN's IDN policies for new launches, so
|
||||
* it is retained solely for legacy compatibility with already-launched TLDs.
|
||||
*/
|
||||
JA("japanese.txt");
|
||||
|
||||
private final IdnTable table;
|
||||
|
||||
IdnTableEnum() {
|
||||
this.table = load(Ascii.toLowerCase(name()));
|
||||
IdnTableEnum(String filename) {
|
||||
this.table = load(Ascii.toLowerCase(name()), filename);
|
||||
}
|
||||
|
||||
public IdnTable getTable() {
|
||||
return table;
|
||||
}
|
||||
|
||||
private static IdnTable load(String name) {
|
||||
private static IdnTable load(String tableName, String filename) {
|
||||
try {
|
||||
URL resource = Resources.getResource(IdnTableEnum.class, name + ".txt");
|
||||
return IdnTable.createFrom(name, readLines(resource, UTF_8), LanguageValidator.get(name));
|
||||
URL resource = Resources.getResource(IdnTableEnum.class, filename);
|
||||
return IdnTable.createFrom(
|
||||
tableName, readLines(resource, UTF_8), LanguageValidator.get(tableName));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e); // should never happen
|
||||
}
|
||||
|
|
|
@ -49,6 +49,7 @@ U+011F # LATIN SMALL LETTER G WITH BREVE
|
|||
U+01E7 # LATIN SMALL LETTER G WITH CARON
|
||||
U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
|
||||
U+0123 # LATIN SMALL LETTER G WITH CEDILLA
|
||||
U+01E5 # LATIN SMALL LETTER G WITH STROKE
|
||||
U+0068 # LATIN SMALL LETTER H
|
||||
U+0127 # LATIN SMALL LETTER H WITH STROKE
|
||||
U+0069 # LATIN SMALL LETTER I
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
# Registry: Charleston Road Registry Inc.
|
||||
# Script: Latn
|
||||
# Version: 2.0
|
||||
# Effective Date: 2023-04-04
|
||||
# URL: https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt
|
||||
# Policy: https://www.registry.google/about/policies/domainabuse/
|
||||
# Contact Name: CRR Tech
|
||||
# Email address: crr-tech@google.com
|
||||
# Telephone: +1 (650) 253-0000
|
||||
#
|
||||
# Code points requiring context rules
|
||||
#
|
||||
# Code point Description of rule/Reference
|
||||
#
|
||||
# U+002D Label must neither start nor end with U+002D. Label
|
||||
# HYPHEN-MINUS must not have U+002D in both third and fourth
|
||||
# position. RFC 5891 (sec 4.2.3.1)
|
||||
#
|
||||
|
||||
U+002D # HYPHEN-MINUS
|
||||
U+0030 # DIGIT ZERO
|
||||
U+0031 # DIGIT ONE
|
||||
U+0032 # DIGIT TWO
|
||||
U+0033 # DIGIT THREE
|
||||
U+0034 # DIGIT FOUR
|
||||
U+0035 # DIGIT FIVE
|
||||
U+0036 # DIGIT SIX
|
||||
U+0037 # DIGIT SEVEN
|
||||
U+0038 # DIGIT EIGHT
|
||||
U+0039 # DIGIT NINE
|
||||
U+0061 # LATIN SMALL LETTER A
|
||||
U+00E0 # LATIN SMALL LETTER A WITH GRAVE
|
||||
U+0103 # LATIN SMALL LETTER A WITH BREVE
|
||||
U+00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX
|
||||
U+00E5 # LATIN SMALL LETTER A WITH RING ABOVE
|
||||
U+00E4 # LATIN SMALL LETTER A WITH DIAERESIS
|
||||
U+00E3 # LATIN SMALL LETTER A WITH TILDE
|
||||
U+0105 # LATIN SMALL LETTER A WITH OGONEK
|
||||
U+00E6 # LATIN SMALL LETTER AE
|
||||
U+0062 # LATIN SMALL LETTER B
|
||||
U+0063 # LATIN SMALL LETTER C
|
||||
U+0107 # LATIN SMALL LETTER C WITH ACUTE
|
||||
U+010D # LATIN SMALL LETTER C WITH CARON
|
||||
U+00E7 # LATIN SMALL LETTER C WITH CEDILLA
|
||||
U+0064 # LATIN SMALL LETTER D
|
||||
U+010F # LATIN SMALL LETTER D WITH CARON
|
||||
U+0111 # LATIN SMALL LETTER D WITH STROKE
|
||||
U+00F0 # LATIN SMALL LETTER ETH
|
||||
U+0065 # LATIN SMALL LETTER E
|
||||
U+00E9 # LATIN SMALL LETTER E WITH ACUTE
|
||||
U+00E8 # LATIN SMALL LETTER E WITH GRAVE
|
||||
U+00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX
|
||||
U+011B # LATIN SMALL LETTER E WITH CARON
|
||||
U+00EB # LATIN SMALL LETTER E WITH DIAERESIS
|
||||
U+0119 # LATIN SMALL LETTER E WITH OGONEK
|
||||
U+0113 # LATIN SMALL LETTER E WITH MACRON
|
||||
U+0117 # LATIN SMALL LETTER E WITH DOT ABOVE
|
||||
U+0259 # LATIN SMALL LETTER SCHWA
|
||||
U+0066 # LATIN SMALL LETTER F
|
||||
U+0067 # LATIN SMALL LETTER G
|
||||
U+011F # LATIN SMALL LETTER G WITH BREVE
|
||||
U+0121 # LATIN SMALL LETTER G WITH DOT ABOVE
|
||||
U+0068 # LATIN SMALL LETTER H
|
||||
U+0127 # LATIN SMALL LETTER H WITH STROKE
|
||||
U+0069 # LATIN SMALL LETTER I
|
||||
U+00EC # LATIN SMALL LETTER I WITH GRAVE
|
||||
U+00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX
|
||||
U+012F # LATIN SMALL LETTER I WITH OGONEK
|
||||
U+012B # LATIN SMALL LETTER I WITH MACRON
|
||||
U+006A # LATIN SMALL LETTER J
|
||||
U+006B # LATIN SMALL LETTER K
|
||||
U+01E9 # LATIN SMALL LETTER K WITH CARON
|
||||
U+0137 # LATIN SMALL LETTER K WITH CEDILLA
|
||||
U+006C # LATIN SMALL LETTER L
|
||||
U+013A # LATIN SMALL LETTER L WITH ACUTE
|
||||
U+013E # LATIN SMALL LETTER L WITH CARON
|
||||
U+013C # LATIN SMALL LETTER L WITH CEDILLA
|
||||
U+0142 # LATIN SMALL LETTER L WITH STROKE
|
||||
U+006D # LATIN SMALL LETTER M
|
||||
U+006E # LATIN SMALL LETTER N
|
||||
U+0148 # LATIN SMALL LETTER N WITH CARON
|
||||
U+00F1 # LATIN SMALL LETTER N WITH TILDE
|
||||
U+0146 # LATIN SMALL LETTER N WITH CEDILLA
|
||||
U+006F # LATIN SMALL LETTER O
|
||||
U+00F2 # LATIN SMALL LETTER O WITH GRAVE
|
||||
U+00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX
|
||||
U+00F6 # LATIN SMALL LETTER O WITH DIAERESIS
|
||||
U+0151 # LATIN SMALL LETTER O WITH DOUBLE ACUTE
|
||||
U+00F5 # LATIN SMALL LETTER O WITH TILDE
|
||||
U+00F8 # LATIN SMALL LETTER O WITH STROKE
|
||||
U+0153 # LATIN SMALL LIGATURE OE
|
||||
U+0070 # LATIN SMALL LETTER P
|
||||
U+0071 # LATIN SMALL LETTER Q
|
||||
U+0072 # LATIN SMALL LETTER R
|
||||
U+0155 # LATIN SMALL LETTER R WITH ACUTE
|
||||
U+0159 # LATIN SMALL LETTER R WITH CARON
|
||||
U+0073 # LATIN SMALL LETTER S
|
||||
U+015B # LATIN SMALL LETTER S WITH ACUTE
|
||||
U+0161 # LATIN SMALL LETTER S WITH CARON
|
||||
U+015F # LATIN SMALL LETTER S WITH CEDILLA
|
||||
U+0074 # LATIN SMALL LETTER T
|
||||
U+0165 # LATIN SMALL LETTER T WITH CARON
|
||||
U+0167 # LATIN SMALL LETTER T WITH STROKE
|
||||
U+0075 # LATIN SMALL LETTER U
|
||||
U+00F9 # LATIN SMALL LETTER U WITH GRAVE
|
||||
U+00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX
|
||||
U+016F # LATIN SMALL LETTER U WITH RING ABOVE
|
||||
U+0171 # LATIN SMALL LETTER U WITH DOUBLE ACUTE
|
||||
U+0173 # LATIN SMALL LETTER U WITH OGONEK
|
||||
U+016B # LATIN SMALL LETTER U WITH MACRON
|
||||
U+0076 # LATIN SMALL LETTER V
|
||||
U+0077 # LATIN SMALL LETTER W
|
||||
U+0175 # LATIN SMALL LETTER W WITH CIRCUMFLEX
|
||||
U+0078 # LATIN SMALL LETTER X
|
||||
U+0079 # LATIN SMALL LETTER Y
|
||||
U+00FD # LATIN SMALL LETTER Y WITH ACUTE
|
||||
U+0177 # LATIN SMALL LETTER Y WITH CIRCUMFLEX
|
||||
U+00FF # LATIN SMALL LETTER Y WITH DIAERESIS
|
||||
U+007A # LATIN SMALL LETTER Z
|
||||
U+017A # LATIN SMALL LETTER Z WITH ACUTE
|
||||
U+017E # LATIN SMALL LETTER Z WITH CARON
|
||||
U+0292 # LATIN SMALL LETTER EZH
|
||||
U+01EF # LATIN SMALL LETTER EZH WITH CARON
|
||||
U+00FE # LATIN SMALL LETTER THORN
|
|
@ -21,6 +21,11 @@
|
|||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
</rdeIDN:idnTableRef>
|
||||
|
||||
<rdeIDN:idnTableRef id="unconfusable_latin">
|
||||
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt</rdeIDN:url>
|
||||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
</rdeIDN:idnTableRef>
|
||||
|
||||
<rdeIDN:idnTableRef id="ja">
|
||||
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url>
|
||||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
|
@ -32,7 +37,7 @@
|
|||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
|
||||
</rdeHeader:header>
|
||||
|
||||
</rde:contents>
|
||||
|
|
|
@ -14,6 +14,6 @@
|
|||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
|
||||
</rdeHeader:header>
|
||||
</rdeReport:report>
|
||||
|
|
|
@ -245,6 +245,11 @@
|
|||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
</rdeIDN:idnTableRef>
|
||||
|
||||
<rdeIDN:idnTableRef id="unconfusable_latin">
|
||||
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt</rdeIDN:url>
|
||||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
</rdeIDN:idnTableRef>
|
||||
|
||||
<rdeIDN:idnTableRef id="ja">
|
||||
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url>
|
||||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
|
@ -256,7 +261,7 @@
|
|||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
|
||||
</rdeHeader:header>
|
||||
|
||||
</rde:contents>
|
||||
|
|
|
@ -37,6 +37,6 @@
|
|||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeDomain-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
|
||||
</rdeHeader:header>
|
||||
</rdeReport:report>
|
||||
|
|
|
@ -119,6 +119,11 @@
|
|||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
</rdeIDN:idnTableRef>
|
||||
|
||||
<rdeIDN:idnTableRef id="unconfusable_latin">
|
||||
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_latn_2.0.txt</rdeIDN:url>
|
||||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
</rdeIDN:idnTableRef>
|
||||
|
||||
<rdeIDN:idnTableRef id="ja">
|
||||
<rdeIDN:url>https://www.iana.org/domains/idn-tables/tables/google_ja_1.0.txt</rdeIDN:url>
|
||||
<rdeIDN:urlPolicy>https://www.registry.google/about/policies/domainabuse/</rdeIDN:urlPolicy>
|
||||
|
@ -130,7 +135,7 @@
|
|||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeContact-1.0">0</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeHost-1.0">1</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeRegistrar-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">2</rdeHeader:count>
|
||||
<rdeHeader:count uri="urn:ietf:params:xml:ns:rdeIDN-1.0">3</rdeHeader:count>
|
||||
</rdeHeader:header>
|
||||
|
||||
</rde:contents>
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue