Rev 17 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
12 | daniel-mar | 1 | # |
2 | # VGWhoIs (ViaThinkSoft Global WhoIs, a fork of generic Whois / gwhois) |
||
3 | # Main program |
||
4 | # |
||
92 | daniel-mar | 5 | # (c) 2010-2022 by Daniel Marschall, ViaThinkSoft <info@daniel-marschall.de> |
12 | daniel-mar | 6 | # based on the code (c) 1998-2010 by Juliane Holzt <debian@kju.de> |
7 | # Some early parts by Lutz Donnerhacke <Lutz.Donnerhacke@Jena.Thur.de> |
||
8 | # |
||
9 | # License: https://www.gnu.org/licenses/gpl-2.0.html (GPL version 2) |
||
10 | # |
||
11 | |||
12 | package VGWhoIs::Utils; |
||
13 | |||
14 | use warnings; |
||
15 | use strict; |
||
16 | |||
17 | use Encode; |
||
18 | |||
19 | # $result = VGWhoIs::Utils::lynxsource($url) |
||
20 | sub VGWhoIs::Utils::lynxsource { |
||
21 | my ($url) = @_; |
||
22 | $url = quotemeta($url); |
||
23 | # LYNX sometimes hangs in combination with TOR |
||
24 | # return qx{lynx -connect_timeout=10 -source $url}; |
||
92 | daniel-mar | 25 | return qx{curl --insecure --user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36" --silent --max-time 10 $url}; |
12 | daniel-mar | 26 | } |
27 | |||
92 | daniel-mar | 28 | sub VGWhoIs::Utils::lynxrender { |
29 | my ($url) = @_; |
||
30 | $url = quotemeta($url); |
||
31 | |||
32 | use File::Basename; |
||
33 | my $script_dir = undef; |
||
34 | if(-l __FILE__) { |
||
35 | $script_dir = dirname(readlink(__FILE__)); |
||
36 | } else { |
||
37 | $script_dir = dirname(__FILE__); |
||
38 | } |
||
39 | $script_dir = quotemeta($script_dir); |
||
40 | |||
41 | my $result = qx{lynx -cfg $script_dir/../lynx.cfg -dump -connect_timeout=10 $url 2>&1}; |
||
42 | $result .= "FAILED with exit code $?\n\n" if $?; |
||
43 | return $result; |
||
44 | } |
||
45 | |||
12 | daniel-mar | 46 | # $line = htmlpre($line); |
47 | sub VGWhoIs::Utils::htmlpre { |
||
48 | my ($line) = @_; |
||
49 | $line =~ s|\n|<br>|g; |
||
50 | $line =~ s| | |g; |
||
51 | return $line; |
||
52 | } |
||
53 | |||
54 | # $rendered = VGWhoIs::Utils::render_html($html); |
||
55 | sub VGWhoIs::Utils::render_html { |
||
56 | my ($html) = @_; |
||
57 | |||
58 | return '' if !defined $html; |
||
59 | |||
60 | $html =~ s|<!--.*?-->||gsi; |
||
61 | |||
62 | $html =~ s|<pre>(.*?)</pre>|VGWhoIs::Utils::htmlpre($1)|gsei; |
||
63 | $html =~ s|<textarea>(.*?)</textarea>|VGWhoIs::Utils::htmlpre($1)|gsei; |
||
64 | |||
17 | daniel-mar | 65 | #TODO: big problem here: if the output is "content-type: text/plain", then we must not call render_html!!! |
12 | daniel-mar | 66 | $html =~ s|\n| |g; |
67 | |||
68 | $html =~ s|<p\s*/{0,1}\s*>|\n|gsi; |
||
69 | $html =~ s|<p\s.*?>|\n|gsi; |
||
70 | |||
71 | $html =~ s|<tr\s*/{0,1}\s*>|\n|gsi; |
||
72 | $html =~ s|<tr\s.*?>|\n|gsi; |
||
73 | $html =~ s|<td>| |gsi; |
||
74 | |||
75 | $html =~ s|<script.*?</script>||gsi; |
||
76 | $html =~ s|<style.*?</style>||gsi; |
||
77 | |||
78 | $html =~ s| \t| |gsi; |
||
79 | $html =~ s|\s*\n\s*\n|\n|gsi; |
||
80 | $html =~ s|^\s*||gm; |
||
81 | |||
82 | $html =~ s| | |gsi; |
||
83 | $html =~ s|<br\s*/{0,1}\s*>|\n|gsi; |
||
84 | $html =~ s|<br\s.*?>|\n|gsi; |
||
85 | $html =~ s|\<.*?\>||gsi; |
||
86 | |||
87 | return($html); |
||
88 | } |
||
89 | |||
90 | |||
91 | sub VGWhoIs::Utils::trim($) { |
||
92 | # Source: http://www.somacon.com/p114.php |
||
93 | my $string = shift; |
||
94 | $string =~ s/^\s+//; |
||
95 | $string =~ s/\s+$//; |
||
96 | return $string; # TODO: ein push faende ich besser |
||
97 | } |
||
98 | |||
99 | sub VGWhoIs::Utils::is_uc($) { |
||
100 | my $str = shift; |
||
101 | |||
102 | my $char; |
||
103 | foreach $char (split //, $str) { |
||
104 | return 1 if (ord($char) > 255); |
||
105 | } |
||
106 | |||
107 | return 0; |
||
108 | } |
||
109 | |||
110 | sub VGWhoIs::Utils::is_ascii($) { |
||
111 | my $str = shift; |
||
112 | |||
113 | my $char; |
||
114 | foreach $char (split //, $str) { |
||
115 | return 0 if (ord($char) >= 128); |
||
116 | } |
||
117 | |||
118 | return 1; |
||
119 | } |
||
120 | |||
121 | sub VGWhoIs::Utils::is_utf8($) { |
||
122 | my $str = shift; |
||
123 | |||
124 | my $s = eval { Encode::decode('utf8', $str, Encode::FB_CROAK) }; |
||
125 | return defined($s); |
||
126 | |||
127 | # This procedure does not work :-( VGWhoIs::Utils::is_utf8 and valid are true even if they should not... |
||
128 | # return 1 if utf8::VGWhoIs::Utils::is_utf8($str); |
||
129 | # return 0 if VGWhoIs::Utils::is_uc($str); |
||
130 | # return 1 if (Encode::Detect::Detector::detect($str) eq "UTF-8"); |
||
131 | # return utf8::valid($str); |
||
132 | } |
||
133 | |||
134 | sub VGWhoIs::Utils::enforce_utf8($) { |
||
135 | my $str = shift; |
||
136 | |||
137 | if (VGWhoIs::Utils::is_uc($str)) { |
||
138 | $str =~ s/^\x{FEFF}//; |
||
139 | utf8::encode($str); |
||
140 | } |
||
141 | elsif (!VGWhoIs::Utils::is_utf8($str)) { |
||
142 | $str =~ s/^\xEF\xBB\xBF//; |
||
143 | utf8::encode($str); |
||
144 | } |
||
145 | |||
146 | return $str; |
||
147 | } |
||
148 | |||
149 | # ($protocol, $hostname) = VGWhoIs::Utils::splitProtocolHost($url) |
||
150 | sub VGWhoIs::Utils::splitProtocolHost($) { |
||
151 | my $url = shift; |
||
152 | |||
153 | my ($protocol, $hostname) = $url =~ /(https{0,1}):\/\/([^\/]+)/; |
||
154 | |||
155 | return ($protocol, $hostname); |
||
156 | } |
||
157 | |||
158 | 1; |
||
159 |