// google-nomulus/java/google/registry/tldconfig/idn/JapaneseLanguageValidator.java

// Copyright 2016 The Domain Registry Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package google.registry.tldconfig.idn;

import static java.lang.Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION;
import static java.lang.Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
import static java.lang.Character.UnicodeBlock.HIRAGANA;
import static java.lang.Character.UnicodeBlock.KATAKANA;

import com.google.common.collect.ImmutableRangeSet;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Range;

import java.lang.Character.UnicodeBlock;

/**
 * Checks domain labels against the rules for the Japanese language. Only meant to be paired with
 * a Japanese language IDN table.
 *
 * <p>A label is rejected when any of the following holds:
 *
 * <ul>
 *   <li>'ー' appears at the start of the label or directly after a codepoint whose Unicode block
 *       is neither HIRAGANA nor KATAKANA;
 *   <li>'・' or '〆' appears, but the label has no Japanese codepoint other than '〆', '・' and
 *       'ー';
 *   <li>the label contains a Japanese codepoint and is longer than 15 codepoints.
 * </ul>
 */
class JapaneseLanguageValidator extends LanguageValidator {

  /** Largest number of codepoints allowed in a label that contains any Japanese codepoint. */
  private static final int MAX_LENGTH_JAPANESE_STRING = 15;

  /** Codepoint of the character '〆'. */
  private static final int IDEOGRAPHIC_CLOSING_MARK = 0x3006;

  /** Codepoint of the character '・'. */
  private static final int KATAKANA_MIDDLE_DOT = 0x30FB;

  /** Codepoint of the character 'ー'. */
  private static final int KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK = 0x30FC;

  /** The {@link UnicodeBlock}s whose codepoints this validator treats as Japanese. */
  private static final ImmutableSet<UnicodeBlock> JAPANESE_UNICODE_BLOCKS =
      ImmutableSet.of(HIRAGANA, KATAKANA, CJK_UNIFIED_IDEOGRAPHS, CJK_SYMBOLS_AND_PUNCTUATION);

  /**
   * Japanese codepoints that are "exceptions": on their own they do not satisfy the requirement
   * that a label containing a KATAKANA MIDDLE DOT or IDEOGRAPHIC CLOSING MARK also contain a
   * Japanese codepoint. Only Japanese codepoints outside this set satisfy it.
   */
  private static final ImmutableRangeSet<Integer> JAPANESE_EXCEPTION_CODEPOINTS =
      new ImmutableRangeSet.Builder<Integer>()
          .add(Range.<Integer>singleton(IDEOGRAPHIC_CLOSING_MARK))
          .add(Range.<Integer>singleton(KATAKANA_MIDDLE_DOT))
          .add(Range.<Integer>singleton(KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK))
          .build();

  /** Returns true if {@code block} is the HIRAGANA or the KATAKANA block. */
  private static boolean isKana(UnicodeBlock block) {
    return block == HIRAGANA || block == KATAKANA;
  }

  /**
   * Returns true for the two codepoints ('・' and '〆') whose presence obliges the label to also
   * contain a non-exception Japanese codepoint.
   */
  private static boolean demandsOrdinaryJapanese(int codepoint) {
    return codepoint == KATAKANA_MIDDLE_DOT || codepoint == IDEOGRAPHIC_CLOSING_MARK;
  }

  /**
   * Walks the label one codepoint at a time and applies the Japanese language rules.
   *
   * @param label the label to check
   * @return false if the label breaks any of the rules described on this class, true otherwise
   */
  @Override
  boolean isValidLabelForLanguage(String label) {
    boolean sawDotOrClosingMark = false;
    boolean sawJapanese = false;
    boolean sawOrdinaryJapanese = false;
    int codepointCount = 0;
    // Block of the codepoint just before the current one; null while at the start of the label.
    UnicodeBlock previousBlock = null;

    int offset = 0;
    while (offset < label.length()) {
      final int current = label.codePointAt(offset);
      final UnicodeBlock currentBlock = UnicodeBlock.of(current);

      // The prolonged sound mark is only allowed directly after a codepoint from the HIRAGANA or
      // KATAKANA block, so it can never open a label.
      if (current == KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK && !isKana(previousBlock)) {
        return false;
      }

      // Remember that the label carries a mark which must be accompanied by some ordinary
      // (non-exception) Japanese codepoint; this is verified once the whole label has been read.
      sawDotOrClosingMark |= demandsOrdinaryJapanese(current);

      // This validator should only ever be used with a Japanese IDN table, so any non-ASCII
      // codepoint ought to be Japanese already; the block lookup is an extra safeguard.
      if (JAPANESE_UNICODE_BLOCKS.contains(currentBlock)) {
        sawJapanese = true;
        // A Japanese codepoint that is not one of the exceptions is an ordinary one.
        if (!JAPANESE_EXCEPTION_CODEPOINTS.contains(current)) {
          sawOrdinaryJapanese = true;
        }
      }

      previousBlock = currentBlock;
      codepointCount++;
      // A codepoint may span two Java chars (a surrogate pair), so step by its actual width.
      offset += Character.charCount(current);
    }

    // '・' or '〆' without any ordinary Japanese codepoint elsewhere in the label is not allowed.
    if (sawDotOrClosingMark && !sawOrdinaryJapanese) {
      return false;
    }

    // The length cap applies only to labels holding a Japanese codepoint (exceptions included).
    return !sawJapanese || codepointCount <= MAX_LENGTH_JAPANESE_STRING;
  }
}