Rev 4 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4 | daniel-mar | 1 | /* |
2 | * @(#)EmailSyntaxValidator.java |
||
3 | * |
||
4 | * Summary: Validate syntax of email addresses. |
||
5 | * |
||
6 | * Copyright: (c) 2002-2010 Roedy Green, Canadian Mind Products, http://mindprod.com |
||
7 | * |
||
8 | * Licence: This software may be copied and used freely for any purpose but military. |
||
9 | * http://mindprod.com/contact/nonmil.html |
||
10 | * |
||
11 | * Requires: JDK 1.5+ |
||
12 | * |
||
13 | * Created with: IntelliJ IDEA IDE. |
||
14 | * |
||
15 | * Version History: |
||
16 | * 1.7 2007-08-21 |
||
17 | */ |
||
18 | package com.mindprod.bulk; |
||
19 | |||
20 | // Download newest version here: |
||
21 | // http://mindprod.com/products1.html#BULK |
||
22 | // SVN: |
||
23 | // http://wush.net/svn/mindprod/com/mindprod/bulk/EmailSyntaxValidator.java |
||
24 | |||
25 | // TODO: BAD TLDS + PSEUDO (TOR: EXIT ETC) |
||
26 | daniel-mar | 26 | // TODO: Awaiting official commit for this Patch |
4 | daniel-mar | 27 | |
28 | // CHANGELOG BY DANIEL MARSCHALL |
||
29 | // |
||
30 | //Added ccTLDs |
||
31 | // |
||
32 | //.ax = Aland Islands |
||
33 | //.eu = European Union |
||
34 | //.me = Montenegro |
||
35 | //.rs = Serbia |
||
36 | //.su = Soviet Union (being phased out) |
||
37 | //.tl = Timor-Leste |
||
38 | // |
||
39 | //Deleted ccTLDs |
||
40 | // |
||
41 | //.bv = Bouvet Island [Allocated/unused] |
||
42 | //.eh = Western Sahara [Reserved/unassigned] |
||
43 | //.fx = UNKNOWN |
||
44 | //.gb = United Kingdom [Allocated/unused] |
||
45 | //.pm = Saint Pierre and Miquelon [Allocated/unused] |
||
46 | //.sj = Svalbard and Jan Mayen [Allocated/unused] |
||
47 | //.so = Somalia [Allocated/unused] |
||
48 | //.um = United States Minor Outlying Islands [Reserved/unassigned] |
||
49 | //.yt = Mayotte [Allocated/unused] |
||
50 | //.yu = Yugoslavia [Deleted/retired] |
||
51 | // |
||
52 | //Added BAD TLDs |
||
53 | // |
||
54 | //.example (RFC 2606) |
||
55 | //.localhost (RFC 2606) |
||
56 | //.test (RFC 2606) |
||
57 | // |
||
58 | //Added official TLDs |
||
59 | // |
||
60 | //.arpa (infrastructure TLD) |
||
61 | //.tel (sponsored TLD) -- official TLD or rare TLD? |
||
62 | //.mobi (sponsored TLD) -- official TLD or rare TLD? |
||
63 | //.jobs (sponsored TLD) -- official TLD or rare TLD? |
||
64 | //.cat (sponsored TLD) -- official TLD or rare TLD? |
||
65 | // |
||
66 | //Other changes |
||
67 | // |
||
68 | //* Commented out unused debugging stuff |
||
69 | //* Removed main procedure and syso import |
||
70 | |||
71 | import javax.mail.internet.AddressException; |
||
72 | import javax.mail.internet.InternetAddress; |
||
73 | import java.util.Arrays; |
||
74 | import java.util.HashSet; |
||
75 | import java.util.Locale; |
||
76 | import java.util.regex.Matcher; |
||
77 | import java.util.regex.Pattern; |
||
78 | |||
79 | /** |
||
80 | * Validate syntax of email addresses. |
||
81 | * <p> |
||
82 | * Does not probe to see if mailserver exists in DNS or online. See MailProber |
||
83 | * for that. See ValidateEmailFile for an example of how to use this class. |
||
84 | * |
||
85 | * @author Roedy Green, Canadian Mind Products |
||
86 | * @version 1.7 2007-08-21 |
||
87 | * @since 2002 |
||
88 | */ |
||
89 | // TODO: @version check validity of & in first part of email address. Appears in |
||
90 | // practice. |
||
91 | |||
92 | public final class EmailSyntaxValidator { |
||
93 | // ------------------------------ CONSTANTS ------------------------------ |
||
94 | |||
95 | /** |
||
96 | * True if want extra debugging output. |
||
97 | */ |
||
98 | // @SuppressWarnings( { "UnusedDeclaration" }) |
||
99 | // private static final boolean DEBUGGING = false; |
||
100 | |||
101 | /** |
||
102 | * Country where this program is running. |
||
103 | */ |
||
104 | private static final String THIS_COUNTRY = Locale.getDefault().getCountry() |
||
105 | .toLowerCase(); |
||
106 | |||
107 | /** |
||
108 | * Bad top level domains -- ones never valid. |
||
109 | */ |
||
110 | private static final HashSet<String> BAD_TLDS = hmaker(new String[] { |
||
111 | "invalid", "nowhere", "noone", "test", "example", "localhost", }); |
||
112 | |||
113 | /** |
||
114 | * Top level domains for countries. |
||
115 | */ |
||
116 | private static final HashSet<String> NATIONAL_TLDS = hmaker(new String[] { |
||
117 | "ac", "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", |
||
118 | "ar", "as", "at", "au", "aw", "ax", "az", "ba", "bb", "bd", "be", |
||
119 | "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt", |
||
120 | "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", |
||
121 | "cl", "cm", "cn", "co", "cr", "cu", "cv", "cx", "cy", "cz", "de", |
||
122 | "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "er", "es", "et", |
||
123 | "eu", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gd", "ge", |
||
124 | "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs", |
||
125 | "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id", |
||
126 | "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je", "jm", |
||
127 | "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", |
||
128 | "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", |
||
129 | "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml", "mm", |
||
130 | "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", |
||
131 | "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", |
||
132 | "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl", |
||
133 | "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", "ru", |
||
134 | "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sk", "sl", |
||
135 | "sm", "sn", "sr", "st", "su", "sv", "sy", "sz", "tc", "td", "tf", |
||
136 | "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "tt", |
||
137 | "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc", |
||
138 | "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "za", "zm", "zw", }); |
||
139 | |||
140 | /** |
||
141 | * Official top level domains. |
||
142 | */ |
||
143 | private static final HashSet<String> OFFICIAL_TLDS = hmaker(new String[] { |
||
144 | "aero", "biz", "coop", "com", "edu", "gov", "info", "mil", |
||
145 | "museum", "name", "net", "org", "pro", "tel", "mobi", "jobs", |
||
146 | "cat", "arpa", }); |
||
147 | |||
148 | /** |
||
149 | * Rarely used top level domains |
||
150 | */ |
||
151 | private static final HashSet<String> RARE_TLDS = hmaker(new String[] { |
||
152 | "cam", "mp3", "agent", "art", "arts", "asia", "auction", "aus", |
||
153 | "bank", "cam", "chat", "church", "club", "corp", "dds", "design", |
||
154 | "dns2go", "e", "email", "exp", "fam", "family", "faq", "fed", |
||
155 | "film", "firm", "free", "fun", "g", "game", "games", "gay", "ger", |
||
156 | "globe", "gmbh", "golf", "gov", "help", "hola", "i", "inc", "int", |
||
157 | "jpn", "k12", "kids", "law", "learn", "llb", "llc", "llp", "lnx", |
||
158 | "love", "ltd", "mag", "mail", "med", "media", "mp3", "netz", "nic", |
||
159 | "nom", "npo", "per", "pol", "prices", "radio", "rsc", "school", |
||
160 | "scifi", "sea", "service", "sex", "shop", "sky", "soc", "space", |
||
161 | "sport", "tech", "tour", "travel", "usvi", "video", "web", "wine", |
||
162 | "wir", "wired", "zine", "zoo", }); |
||
163 | |||
164 | /** |
||
165 | * regex to allow dots anywhere, but not at start of domain name, no + |
||
166 | */ |
||
167 | private static final Pattern p3 = Pattern |
||
168 | .compile("[a-z0-9\\-_\\.]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++"); |
||
169 | |||
170 | /** |
||
171 | * regex IP style names, no + |
||
172 | */ |
||
173 | private static final Pattern p4 = Pattern |
||
174 | .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]"); |
||
175 | |||
176 | /** |
||
177 | * regex to allow - _ dots in name, no + |
||
178 | */ |
||
179 | private static final Pattern p5 = Pattern |
||
180 | .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++"); |
||
181 | |||
182 | /** |
||
183 | * regex to allow _ - in name, lead and trailing ones are filtered later, no |
||
184 | * +. |
||
185 | */ |
||
186 | private static final Pattern p9 = Pattern |
||
187 | .compile("[a-z0-9\\-_]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++"); |
||
188 | |||
189 | /** |
||
190 | * regex to split into fields |
||
191 | */ |
||
192 | private static final Pattern splitter = Pattern.compile("[@\\.]"); |
||
193 | |||
194 | // -------------------------- PUBLIC STATIC METHODS |
||
195 | // -------------------------- |
||
196 | |||
197 | /** |
||
198 | * Check how likely an email address is to be valid. The higher the number |
||
199 | * returned, the more likely the address is valid. This method does not |
||
200 | * probe the internet in any way to see if the corresponding mail server or |
||
201 | * domain exists. |
||
202 | * |
||
203 | * @param email |
||
204 | * bare computer email address. e.g. roedyg@mindprod.com No |
||
205 | * "Roedy Green" <roedyg@mindprod.com> style addresses. No local |
||
206 | * addresses, e.g. roedy. |
||
207 | * |
||
208 | * @return <ul> |
||
209 | * <li>0 = email address is definitely malformed, e.g. missing |
||
210 | * @. ends in .invalid</li> <li>1 = address does not meet one of the valid |
||
211 | * patterns below. It still might be ok according to some obscure rule in |
||
212 | * RFC 822 Java InternetAddress accepts it as valid.</li> <li>2 = unknown |
||
213 | * top level domain.</li> <li>3 = dots at beginning or end, doubled in |
||
214 | * name.</li> <li>4 = address of form xxx@[209.139.205.2] using IP</li> |
||
215 | * <li>5 = address of form xxx.xxx.xxx@xxx.xxx.xxx Dots _ or - in first |
||
216 | * part of name</li> <li>6 = addreess of form xxx@xxx.xxx.xxx rare, but |
||
217 | * known, domain</li> <li>7 = address of form xxx@xxx.xxx.ca or any |
||
218 | * national suffix.</li> <li>8 = address of form xxx@xxx.xxx.xx the |
||
219 | * matching this national suffix, e.g. .ca in Canada, .de in Germany</li> |
||
220 | * <li>9 = address of form xxx@xxx.xxx.com .org .net .edu .gov .biz -- |
||
221 | * official domains</li> |
||
222 | * </ul> |
||
223 | */ |
||
224 | public static int howValid(String email) { |
||
225 | if (email == null) { |
||
226 | return 0; |
||
227 | } |
||
228 | email = email.trim().toLowerCase(); |
||
229 | int dotPlace = email.lastIndexOf('.'); |
||
230 | if (0 < dotPlace && dotPlace < email.length() - 1) { |
||
231 | String tld = email.substring(dotPlace + 1); |
||
232 | if (BAD_TLDS.contains(tld)) { |
||
233 | /* deliberate invalid address */ |
||
234 | return 0; |
||
235 | } |
||
236 | // make sure none of fragments start or end in _ or - |
||
237 | String[] fragments = splitter.split(email); |
||
238 | boolean clean = true; |
||
239 | for (String fragment : fragments) { |
||
240 | if (fragment.startsWith("_") || fragment.endsWith("_") |
||
241 | || fragment.startsWith("-") || fragment.endsWith("-")) { |
||
242 | clean = false; |
||
243 | break; |
||
244 | } |
||
245 | }// end for |
||
246 | if (clean) { |
||
247 | Matcher m9 = p9.matcher(email); |
||
248 | if (m9.matches()) { |
||
249 | if (OFFICIAL_TLDS.contains(tld)) { |
||
250 | return 9; |
||
251 | } else if (THIS_COUNTRY.equals(tld)) { |
||
252 | return 8; |
||
253 | } else if (NATIONAL_TLDS.contains(tld)) { |
||
254 | return 7; |
||
255 | } else if (RARE_TLDS.contains(tld)) { |
||
256 | return 6; |
||
257 | } else { |
||
258 | // TODO: Why is that 3 and not 2? |
||
259 | return 3;/* unknown tld */ |
||
260 | } |
||
261 | } |
||
262 | // allow dots in name |
||
263 | Matcher m5 = p5.matcher(email); |
||
264 | if (m5.matches()) { |
||
265 | if (OFFICIAL_TLDS.contains(tld)) { |
||
266 | return 5; |
||
267 | } else if (THIS_COUNTRY.equals(tld)) { |
||
268 | return 5; |
||
269 | } else if (NATIONAL_TLDS.contains(tld)) { |
||
270 | return 5; |
||
271 | } else if (RARE_TLDS.contains(tld)) { |
||
272 | return 5; |
||
273 | } else { |
||
274 | return 2;/* unknown tld */ |
||
275 | } |
||
276 | } |
||
277 | |||
278 | // IP |
||
279 | Matcher m4 = p4.matcher(email); |
||
280 | if (m4.matches()) { |
||
281 | return 4;/* can't tell TLD */ |
||
282 | } |
||
283 | |||
284 | // allow even lead/trail dots in name, except at start of domain |
||
285 | Matcher m3 = p3.matcher(email); |
||
286 | if (m3.matches()) { |
||
287 | if (OFFICIAL_TLDS.contains(tld)) { |
||
288 | return 3; |
||
289 | } else if (THIS_COUNTRY.equals(tld)) { |
||
290 | return 3; |
||
291 | } else if (NATIONAL_TLDS.contains(tld)) { |
||
292 | return 3; |
||
293 | } else if (RARE_TLDS.contains(tld)) { |
||
294 | return 3; |
||
295 | } else { |
||
296 | return 2;/* unknown domain */ |
||
297 | } |
||
298 | } |
||
299 | }// end if clean |
||
300 | } |
||
301 | // allow even unclean addresses, and addresses without a TLD to have a |
||
302 | // whack at passing RFC:822 |
||
303 | try { |
||
304 | /* |
||
305 | * see if InternetAddress likes it, it follows RFC:822. It will |
||
306 | * names without domains though. |
||
307 | */ |
||
308 | InternetAddress.parse(email, true/* strict */); |
||
309 | // it liked it, no exception happened. Seems very sloppy. |
||
310 | return 1; |
||
311 | } catch (AddressException e) { |
||
312 | // it did not like it |
||
313 | return 0; |
||
314 | } |
||
315 | } |
||
316 | |||
317 | // -------------------------- STATIC METHODS -------------------------- |
||
318 | |||
319 | /** |
||
320 | * build a HashSet from a array of String literals. |
||
321 | * |
||
322 | * @param list |
||
323 | * array of strings |
||
324 | * |
||
325 | * @return HashSet you can use to test if a string is in the set. |
||
326 | */ |
||
327 | private static HashSet<String> hmaker(String[] list) { |
||
328 | HashSet<String> map = new HashSet<String>(Math.max( |
||
329 | (int) (list.length / .75f) + 1, 16)); |
||
330 | map.addAll(Arrays.asList(list)); |
||
331 | return map; |
||
332 | } |
||
333 | |||
334 | // --------------------------- main() method --------------------------- |
||
335 | |||
336 | /** |
||
337 | * main debugging harness. |
||
338 | * |
||
339 | * @param args |
||
340 | * not used |
||
341 | */ |
||
342 | // public static void main(String[] args) { |
||
343 | // out.println(howValid("kellizer@.hotmail.com")); |
||
344 | // } |
||
345 | } |