Subversion Repositories vgwhois

Rev

Rev 7 | Blame | Compare with Previous | Last modification | View Log | RSS feed

#!/usr/bin/php
<?php

#
#  VGWhoIs (ViaThinkSoft Global WhoIs, a fork of generic Whois / gwhois)
#  Maintenance / Developer utilities
#
#  (c) 2012-2019 by Daniel Marschall, ViaThinkSoft <info@daniel-marschall.de>
#
#  License: https://www.gnu.org/licenses/gpl-2.0.html (GPL version 2)
#

# TODO: Automatically convert ":notice||Whois server unknown (2014-03-15)" entries in pattern_newgtld into ":whois|" entries as soon as IANA lists a whois server.

// Report every kind of diagnostic while this maintenance script runs.
// NOTE(review): on current PHP versions E_ALL presumably already covers the
// other flags listed here - confirm before simplifying.
error_reporting(E_ALL | E_NOTICE | E_STRICT | E_DEPRECATED);

// Project configuration and shared helpers.
// NOTE(review): the path segment '/--/' below looks like a garbled '/../'
// (the sibling includes all use '/../../') - verify against the repository.
require_once __DIR__ . '/--/config.inc.phps';
require_once __DIR__ . '/../../shared/php_includes/common_functions.inc.phps';
require_once __DIR__ . '/../../shared/php_includes/idna_convert.class.php';

# ---

// Lazily loaded list of IANA TLDs; filled on first use by step 2 or step 3.
$iana_tld_data = null;

// Step 1:
// Check if in the meantime TLDs without an official whois server were updated to have one
// Attention/TODO: A change of the whois name still needs manual intervention!

$newgtld_cont_original = file_get_contents(NEWGTLD_PATTERN_FILE);

// Without readable content there is nothing to re-check.
if ($newgtld_cont_original !== false) {
        $newgtld_cont_new = preg_replace_callback(
                // Matches the three-line entry which step 2 writes for a TLD without a known whois server:
                //   # TODO: Entry generated automatically. Needs manual check.
                //   :notice||Whois server unknown (YYYY-MM-DD)
                //   \.tld$
                // Step 2 writes the pattern line with a leading backslash ("\.tld$"); the backslash
                // is optional here so that entries without it are still recognised.
                '@# TODO: Entry generated automatically\. Needs manual check\.\n:notice\|\|Whois server unknown \((.*)\)\n\\\\?\.(.*)\$\n@imU',
                function ($match) {
                        $in_all = $match[0]; // the complete entry
                        $in_ts  = $match[1]; // date of the last check
                        $in_tld = $match[2]; // TLD without the leading dot

                        $days_passed = (time() - strtotime($in_ts)) / (60*60*24);
                        if ($days_passed < NEWGTLD_RECHECK_MISSING_WHOIS_SERVERS) {
                                return $in_all; // leave everything unchanged
                        }

                        $whois_serv = find_rootzone_whois_server($in_tld);

                        if (!$whois_serv) {
                                // Nothing found. Just update last check date.
                                return str_replace($in_ts, date('Y-m-d'), $in_all);
                        }

                        // Update the entry. The pattern line keeps the "\.tld$" form, because that is
                        // the form does_exist() searches for; otherwise step 2 would append the TLD again.
                        return ":whois|$whois_serv\n\\.{$in_tld}\$\n";
                },
                $newgtld_cont_original
        );

        // preg_replace_callback() returns null on a PCRE error; never write that back,
        // because it would truncate the pattern file.
        if (is_string($newgtld_cont_new) && ($newgtld_cont_new !== $newgtld_cont_original)) {
                file_put_contents(NEWGTLD_PATTERN_FILE, $newgtld_cont_new);
                gwi_update_newgtld_patternfile();
        }
}

// Step 2:
// Search for new gTLDs which are not in our pattern file

$newgtld_data = gwi_newgtld_get_all_delegated_strings();
foreach ($newgtld_data as $data) {
        // Row layout: array($date, $string, $tld, $tld_uc, $explanation)
        $date        = $data[0];
        $tld         = $data[2];
        $explanation = $data[4];

        // A row whose TLD could not be parsed must be skipped. The IANA list below
        // contains an empty element (trailing line break), so an empty TLD would
        // pass the IANA check and end up as a bogus "\.$" pattern.
        if (!is_string($tld) || ($tld === '')) {
                continue;
        }

        // Is it already in our pattern file?
        if (does_exist($tld)) {
#               echo "Info: $tld is already in pattern.\n";
                continue;
        }

        // Only add the TLD to our pattern file if it has also been published by IANA.
        // The reason is that the newGTLD page had temporary typos many times (e.g. calogne instead of cologne)
        if (!isset($iana_tld_data)) {
                $iana_tld_data = explode("\n", cached_file(IANA_TLD_REGISTRY, CACHE_FILE_DIR));
                $iana_tld_data = array_map('trim', $iana_tld_data);
        }
        if (!in_array(strtoupper($tld), $iana_tld_data, true)) {
                echo "Info: Will not add $tld, since it is not yet added in IANA's registry.\n";
                continue;
        }

        // Build the new entry: comment lines, then the action line, then the pattern line.
        $to_append  = "\n";
        $to_append .= "# Delegated on $date\n";
        if ($explanation) {
                $to_append .= "# $explanation\n";
        }

        $whois_serv = find_rootzone_whois_server($tld);

        if ($whois_serv) {
                $to_append .= ":whois|$whois_serv\n";
        } else {
                // This exact wording is what step 1 searches for when it re-checks entries later.
                $to_append .= "# TODO: Entry generated automatically. Needs manual check.\n";
                $to_append .= ":notice||Whois server unknown (".date('Y-m-d').")\n";
        }
        $to_append .= "\\.$tld$\n";

        file_put_contents(NEWGTLD_PATTERN_FILE, $to_append, FILE_APPEND);

        // Refresh the "#: version" header after every addition.
        gwi_update_newgtld_patternfile();
}






// TODO:
// Step 3: Check if there are IANA TLDs which are not in our pattern files yet
if (!isset($iana_tld_data)) {
        // Load the IANA list once: one TLD per line, surrounding whitespace removed.
        $iana_tld_data = array_map(
                'trim',
                explode("\n", cached_file(IANA_TLD_REGISTRY, CACHE_FILE_DIR))
        );
}
foreach ($iana_tld_data as $iana_tld) {
        // TODO: look into all pattern files (currently only the new gTLD pattern file is checked)
        $already_known = does_exist($iana_tld);
        if ($already_known) {
                continue;
        }

        # echo "Does not exist: $iana_tld\n";

        // Step 3 is not finished yet: this unconditional "continue" keeps the
        // draft below from ever being executed.
        continue;

        $application_count = count_newgtld_applications($iana_tld);
        if ($application_count === false) {
                // TODO: ignore?
        } else if ($application_count > 0) {
                // TODO: add to newgtld pattern file
        } else {
                // TODO: add to normal pattern file
        }
}



# ------------------------------------------------------

/**
 * Refresh the "#: version YYYYMMDD" line of the new gTLD pattern file.
 *
 * If the file has no version line yet, a complete header (version line plus
 * explanatory comments) is put in front of the existing content. The file is
 * only rewritten if its content actually changes. If the file cannot be read,
 * or the replacement fails, nothing is written.
 *
 * @return void
 */
function gwi_update_newgtld_patternfile() {
        $pcont_original = file_get_contents(NEWGTLD_PATTERN_FILE);
        if ($pcont_original === false) {
                // Do not overwrite a file we were not able to read.
                return;
        }

        $now = date('Ymd');

        // Replace only the first version line; $count tells us whether there was one.
        $count = 0;
        $pcont = preg_replace("@#: version (\\S+)@i", "#: version $now", $pcont_original, 1, $count);
        if (!is_string($pcont)) {
                // PCRE error: keep the file untouched.
                return;
        }

        if ($count == 0) {
                // Add header
                $pcont = "#: version $now\n".
                         "# New gTLD\n".
                         "# see: http://newgtlds.icann.org/en/program-status/delegated-strings\n".
                         "# This file can be updated by running \"vgwhois-pattern-update\", but it does only ADD new \"New gTLDs\"\n".
                         "# --------------------------------------------------------------------\n".
                         "\n".$pcont;
        }

        if ($pcont !== $pcont_original) {
                file_put_contents(NEWGTLD_PATTERN_FILE, $pcont);
        }
}

/**
 * Find the whois server of a TLD.
 *
 * The server published by IANA is preferred. If IANA names none and
 * TRY_FINDING_HIDDEN_WHOIS_SERVERS is enabled, a few conventional host names
 * are probed on port 43 and the first one that answers is used.
 *
 * @param string $tld TLD without leading dot
 * @return string|false host name of the whois server, or the (falsy) IANA result if none was found
 */
function find_rootzone_whois_server($tld) {
        $server = iana_get_rootzone_whois_server($tld);

        // Try to find "secret whois servers"
        if (TRY_FINDING_HIDDEN_WHOIS_SERVERS) {
                // TODO: also try out to use the URL of the homepage (in IANAs root DB)
                $candidates = array("whois.nic.$tld", "whois.$tld", "$tld");

                foreach ($candidates as $candidate) {
                        if ($server) {
                                // We already have a server; no further probing.
                                break;
                        }
                        if (gwitc_is_port_open($candidate, 43)) {
                                $server = $candidate;
                        }
                }
        }

        return $server;
}

/**
 * Ask whois.iana.org which whois server is registered for a TLD.
 *
 * @param string $tld TLD without leading dot (case does not matter)
 * @return string|false value of the "whois:" field of the response, or false if there is none
 */
function iana_get_rootzone_whois_server($tld) {
        $tld  = strtolower($tld);
        $cont = QueryWhoisServer('whois.iana.org', $tld);
        if (!is_string($cont)) return false;

        // Only blanks and tabs may separate the field name from its value.
        // With "\s*" an empty "whois:" field would run over the line break and
        // return the first word of the following line (e.g. "status:").
        if (!preg_match('@whois:[ \\t]*(\\S+)@i', $cont, $m)) return false;
        return $m[1];
}

/**
 * Check whether the new gTLD pattern file contains the pattern line "\.<tld>$".
 *
 * The comparison is case-insensitive and requires the pattern to fill a whole
 * line. The line may be the first or the last line of the file, and the file
 * may use Unix, Windows or old Mac line endings.
 *
 * @param string $tld TLD without leading dot
 * @return bool true if the pattern line exists; false if not, or if the file cannot be read
 */
function does_exist($tld) {
        $cont = file_get_contents(NEWGTLD_PATTERN_FILE);
        if ($cont === false) return false;

        // Normalize line endings and frame the content with line breaks, so that
        // the first line and an unterminated last line can be found as well.
        $cont = str_replace(array("\r\n", "\r"), "\n", $cont);
        $cont = "\n" . strtolower($cont) . "\n";

        $tld  = strtolower($tld);
        return (strpos($cont, "\n\\.$tld\$\n") !== false);
}

/**
 * Download ICANN's "delegated strings" page and extract all delegated new gTLDs.
 *
 * The table rows are returned in reversed page order. Each element is
 *   array($date, $string, $tld, $tld_uc, $explanation)
 * where $string is the cleaned-up cell text, $tld is the TLD as it is used in
 * the pattern file (the Punycode form for IDN rows), $tld_uc is the first word
 * of the cell and $explanation is the free text behind the TLD (may be empty).
 *
 * @return array list of rows as described above; empty if the page could not be
 *               downloaded or does not contain the expected table
 */
function gwi_newgtld_get_all_delegated_strings() {
        $cont = file_get_contents('http://newgtlds.icann.org/en/program-status/delegated-strings');
        if ($cont === false) {
                return array();
        }

        // Convert Unicode stuff
        $cont = str_replace('xn'.unichr(0x2013), 'xn--', $cont);
        $cont = str_replace('xn'.unichr(0x2015), 'xn--', $cont); // used in Samsung TLD
        $cont = str_replace(unichr(0x2013), '-', $cont); // used in most explanations
        $cont = str_replace(unichr(0x2015), '-', $cont);
        $cont = str_replace(unichr(0x00fc), 'ue', $cont); // German umlaut ue (used in .koeln)
        // UTF-8 -> ISO-8859-1 (utf8_decode() is deprecated as of PHP 8.2)
        $cont = mb_convert_encoding($cont, 'ISO-8859-1', 'UTF-8');

        // Do some minor corrections
        $cont = str_replace('game (s)', 'game(s)', $cont);

        // Only look at what follows the table header
        $parts = explode('STRING</th>', $cont, 2);
        if (count($parts) < 2) {
                return array();
        }
        $cont = $parts[1];

        // One match per table row: cell 1 = date, cell 2 = string
        preg_match_all('@<tr>\s*<td[^>]*>(.*)</td>\s*<td[^>]*>(.*)</td>\s*</tr>@ismU', $cont, $m, PREG_SET_ORDER);

        $m = array_reverse($m);

        $out = array();
        foreach ($m as $data) {
                $date     = html_entity_decode(strip_tags($data[1]));
                $string   = html_entity_decode(strip_tags($data[2]));

                // Make sure there is a blank in front of "(", then collapse multiple blanks
                $string = str_replace('(', ' (', $string);
                while (strpos($string, '  ') !== false) $string = str_replace('  ', ' ', $string);

                # Fixing some misplaced white spaces
                $string = preg_replace('@\.\s+@m', '.', $string);
                $string = preg_replace('@\(\s+@m', '(', $string);
                $string = preg_replace('@\s+\)@m', ')', $string);

                // Expected IDN form: "<unicode> (<punycode>) - <explanation>"
                $ary = explode(' ', $string, 3);
                if (count($ary) > 1) {
                        $tld_uc      = trim($ary[0]); // Unicode TLD

                        $tld         = trim($ary[1]); // Punycode TLD
                        $tld         = substr($tld, 1, strlen($tld)-2); // remove "(" and ")"

                        // The explanation is optional
                        $explanation = isset($ary[2]) ? trim($ary[2]) : '';
                        if ($explanation !== '') {
                                $explanation = substr($explanation, 1); // remove "-"
                                $explanation = trim($explanation);
                        }
                } else {
                        $tld         = trim(strtolower($string));
                        $tld_uc      = $tld;
                        $explanation = '';
                }

                $out[] = array($date, $string, $tld, $tld_uc, $explanation);
        }

        return $out;
}

/**
 * Return unicode char by its code
 *
 * mb_chr() is used if it is available. The conversion through the
 * 'HTML-ENTITIES' encoding (deprecated as of PHP 8.2) is only kept as fallback
 * for PHP versions without mb_chr() and for code points mb_chr() rejects.
 *
 * @param int $u Unicode code point
 * @return string the character, encoded as UTF-8
 * @source http://www.php.net/manual/de/function.chr.php#88611
 */
function unichr($u) {
        $u = intval($u);

        if (function_exists('mb_chr')) {
                $char = mb_chr($u, 'UTF-8');
                if ($char !== false) {
                        return $char;
                }
        }

        return mb_convert_encoding('&#' . $u . ';', 'UTF-8', 'HTML-ENTITIES');
}

/**
 * Count the new gTLD applications which ICANN's application status page lists for a string.
 *
 * The lookup consists of three curl calls: fetch the search page (session cookie
 * and form token), submit the search form, then fetch the result page again
 * using the same session.
 *
 * Prints a debug line "<string> = <count>".
 *
 * @param string $string TLD to search for; a Punycode name is decoded first if the decoder accepts it
 * @return int|false number of result links whose text is exactly $string,
 *                   or false if a curl call failed or the session/form data was not found
 */
function count_newgtld_applications($string) {
        #global $punycoder;

        #if (is_null($punycoder))
        $punycoder = new idna_convert();
        $ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0';

        if (($x = $punycoder->decode(strtolower($string))) !== false) $string = $x;

        // 1. Open the search page (with response headers) to get a session and the form token
        $out = array();
        exec('curl -i -s https://gtldresult.icann.org/application-result/applicationstatus/viewstatus -H "Host: gtldresult.icann.org" -H '.escapeshellarg('User-Agent: '.$ua), $out, $code);
        if ($code != 0) return false;
        $html = implode("\n", $out);

        if (!preg_match('@JSESSIONID=(.+);@ismU', $html, $m)) return false;
        $jsessionid = $m[1];

        if (!preg_match('@<input value="([^"]+)" name="t:formdata"@ismU', $html, $m)) return false;
        $formdata = $m[1];
        $formdata = str_replace('+', '%2B', $formdata);
        $formdata = str_replace('/', '%2F', $formdata);

        // 2. Submit the search form. The search term is part of a form-urlencoded
        //    body, so it has to be encoded (a decoded IDN string is not plain ASCII).
        $out = array();
        exec('curl -s "https://gtldresult.icann.org/application-result/applicationstatus/viewstatus.applicationstatusform" -H "Host: gtldresult.icann.org" -H '.escapeshellarg('User-Agent: '.$ua).' -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" -H "Accept-Language: de,en-US;q=0.7,en;q=0.3"  -H "Referer: https://gtldresult.icann.org/application-result/applicationstatus/viewstatus" -H '.escapeshellarg('Cookie: JSESSIONID='.$jsessionid).' -H "Connection: keep-alive" --data '.escapeshellarg('t%3Aformdata='.$formdata.'&t%3Asubmit=%5B%22searchButton%22%2C%22searchButton%22%5D&searchField='.urlencode($string).'&searchButton=Search&status=&updates=&objections=&gacew=&similar=&pic='), $out, $code);
        if ($code != 0) return false;

        // 3. Fetch the result page of this session
        $out = array();
        exec('curl -s "https://gtldresult.icann.org/application-result/applicationstatus/viewstatus" -H "Host: gtldresult.icann.org" -H '.escapeshellarg('User-Agent: '.$ua).' -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" -H "Accept-Language: de,en-US;q=0.7,en;q=0.3"  -H "Referer: https://gtldresult.icann.org/application-result/applicationstatus/viewstatus" -H '.escapeshellarg('Cookie: JSESSIONID='.$jsessionid).' -H "DNT: 1" -H "Connection: keep-alive"', $out, $code);
        if ($code != 0) return false;

        $html = implode("\n", $out);

        // Count the detail links whose link text is exactly the searched string
        preg_match_all('@href="/application-result/applicationstatus/viewstatus:viewapplicationdetails/(\d+)">'.preg_quote($string, '@').'</a>@ismU', $html, $m);

        # DEBUG
        echo "$string = ".count($m[1])."\n";

        # return (count($m[1]) > 0);
        return count($m[1]);
}

#assert(count_newgtld_applications('shopping') == 2);