Subversion Repositories javautils

Rev

Rev 4 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4 daniel-mar 1
/*
2
 * @(#)EmailSyntaxValidator.java
3
 *
4
 * Summary: Validate syntax of email addresses.
5
 *
6
 * Copyright: (c) 2002-2010 Roedy Green, Canadian Mind Products, http://mindprod.com
7
 *
8
 * Licence: This software may be copied and used freely for any purpose but military.
9
 *          http://mindprod.com/contact/nonmil.html
10
 *
11
 * Requires: JDK 1.5+
12
 *
13
 * Created with: IntelliJ IDEA IDE.
14
 *
15
 * Version History:
16
 *  1.7 2007-08-21
17
 */
18
package com.mindprod.bulk;
19
 
20
// Download newest version here:
21
// http://mindprod.com/products1.html#BULK
22
// SVN:
23
// http://wush.net/svn/mindprod/com/mindprod/bulk/EmailSyntaxValidator.java
24
 
25
// TODO: BAD TLDS + PSEUDO (TOR: EXIT ETC)
26 daniel-mar 26
// TODO: Awaiting official commit for this Patch
4 daniel-mar 27
 
28
// CHANGELOG BY DANIEL MARSCHALL
29
//
30
//Added ccTLDs
31
//
32
//.ax = Aland Islands
33
//.eu = European Union
34
//.me = Montenegro
35
//.rs = Serbia
36
//.su = Soviet Union (being phased out)
37
//.tl = Timor-Leste
38
//
39
//Deleted ccTLDs
40
//
41
//.bv = Bouvet Island [Allocated/unused]
42
//.eh = Western Sahara [Reserved/unassigned]
43
//.fx = UNKNOWN
44
//.gb = United Kingdom [Allocated/unused]
45
//.pm = Saint Pierre and Miquelon [Allocated/unused]
46
//.sj = Svalbard and Jan Mayen [Allocated/unused]
47
//.so = Somalia [Allocated/unused]
48
//.um = United States Minor Outlying Islands [Reserved/unassigned]
49
//.yt = Mayotte [Allocated/unused]
50
//.yu = Yugoslavia [Deleted/retired]
51
//
52
//Added BAD TLDs
53
//
54
//.example (RFC 2606)
55
//.localhost (RFC 2606)
56
//.test (RFC 2606)
57
//
58
//Added official TLDs
59
//
60
//.arpa (infrastructure TLD)
61
//.tel (sponsored TLD)               -- official TLD or rare TLD?
62
//.mobi (sponsored TLD)              -- official TLD or rare TLD?
63
//.jobs (sponsored TLD)              -- official TLD or rare TLD?
64
//.cat (sponsored TLD)               -- official TLD or rare TLD?
65
//
66
//Other changes
67
//
68
//* Commented out unused debugging stuff
69
//* Removed main procedure and syso import
70
 
71
import javax.mail.internet.AddressException;
72
import javax.mail.internet.InternetAddress;
73
import java.util.Arrays;
74
import java.util.HashSet;
75
import java.util.Locale;
76
import java.util.regex.Matcher;
77
import java.util.regex.Pattern;
78
 
79
/**
80
 * Validate syntax of email addresses.
81
 * <p/>
82
 * Does not probe to see if mailserver exists in DNS or online. See MailProber
83
 * for that. See ValidateEmailFile for an example of how to use this class.
84
 *
85
 * @author Roedy Green, Canadian Mind Products
86
 * @version 1.7 2007-08-21
87
 * @since 2002
88
 */
89
// TODO: check validity of & in first part of email address. Appears in
90
// practice.
91
 
92
public final class EmailSyntaxValidator {
93
        // ------------------------------ CONSTANTS ------------------------------
94
 
95
        /**
96
         * True if want extra debugging output.
97
         */
98
        // @SuppressWarnings( { "UnusedDeclaration" })
99
        // private static final boolean DEBUGGING = false;
100
 
101
        /**
102
         * Country where this program is running.
103
         */
104
        private static final String THIS_COUNTRY = Locale.getDefault().getCountry()
105
                        .toLowerCase();
106
 
107
        /**
108
         * Bad top level domains -- ones never valid.
109
         */
110
        private static final HashSet<String> BAD_TLDS = hmaker(new String[] {
111
                        "invalid", "nowhere", "noone", "test", "example", "localhost", });
112
 
113
        /**
114
         * Top level domains for countries.
115
         */
116
        private static final HashSet<String> NATIONAL_TLDS = hmaker(new String[] {
117
                        "ac", "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq",
118
                        "ar", "as", "at", "au", "aw", "ax", "az", "ba", "bb", "bd", "be",
119
                        "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt",
120
                        "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck",
121
                        "cl", "cm", "cn", "co", "cr", "cu", "cv", "cx", "cy", "cz", "de",
122
                        "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "er", "es", "et",
123
                        "eu", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gd", "ge",
124
                        "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs",
125
                        "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id",
126
                        "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je", "jm",
127
                        "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw",
128
                        "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu",
129
                        "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml", "mm",
130
                        "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx",
131
                        "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np",
132
                        "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl",
133
                        "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", "ru",
134
                        "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sk", "sl",
135
                        "sm", "sn", "sr", "st", "su", "sv", "sy", "sz", "tc", "td", "tf",
136
                        "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "tt",
137
                        "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc",
138
                        "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "za", "zm", "zw", });
139
 
140
        /**
141
         * Official top level domains.
142
         */
143
        private static final HashSet<String> OFFICIAL_TLDS = hmaker(new String[] {
144
                        "aero", "biz", "coop", "com", "edu", "gov", "info", "mil",
145
                        "museum", "name", "net", "org", "pro", "tel", "mobi", "jobs",
146
                        "cat", "arpa", });
147
 
148
        /**
149
         * Rarely used top level domains
150
         */
151
        private static final HashSet<String> RARE_TLDS = hmaker(new String[] {
152
                        "cam", "mp3", "agent", "art", "arts", "asia", "auction", "aus",
153
                        "bank", "cam", "chat", "church", "club", "corp", "dds", "design",
154
                        "dns2go", "e", "email", "exp", "fam", "family", "faq", "fed",
155
                        "film", "firm", "free", "fun", "g", "game", "games", "gay", "ger",
156
                        "globe", "gmbh", "golf", "gov", "help", "hola", "i", "inc", "int",
157
                        "jpn", "k12", "kids", "law", "learn", "llb", "llc", "llp", "lnx",
158
                        "love", "ltd", "mag", "mail", "med", "media", "mp3", "netz", "nic",
159
                        "nom", "npo", "per", "pol", "prices", "radio", "rsc", "school",
160
                        "scifi", "sea", "service", "sex", "shop", "sky", "soc", "space",
161
                        "sport", "tech", "tour", "travel", "usvi", "video", "web", "wine",
162
                        "wir", "wired", "zine", "zoo", });
163
 
164
        /**
165
         * regex to allow dots anywhere, but not at start of domain name, no +
166
         */
167
        private static final Pattern p3 = Pattern
168
                        .compile("[a-z0-9\\-_\\.]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
169
 
170
        /**
171
         * regex IP style names, no +
172
         */
173
        private static final Pattern p4 = Pattern
174
                        .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]");
175
 
176
        /**
177
         * regex to allow - _ dots in name, no +
178
         */
179
        private static final Pattern p5 = Pattern
180
                        .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
181
 
182
        /**
183
         * regex to allow _ - in name, lead and trailing ones are filtered later, no
184
         * +.
185
         */
186
        private static final Pattern p9 = Pattern
187
                        .compile("[a-z0-9\\-_]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
188
 
189
        /**
190
         * regex to split into fields
191
         */
192
        private static final Pattern splitter = Pattern.compile("[@\\.]");
193
 
194
        // -------------------------- PUBLIC STATIC METHODS
195
        // --------------------------
196
 
197
        /**
198
         * Check how likely an email address is to be valid. The higher the number
199
         * returned, the more likely the address is valid. This method does not
200
         * probe the internet in any way to see if the corresponding mail server or
201
         * domain exists.
202
         *
203
         * @param email
204
         *            bare computer email address. e.g. roedyg@mindprod.com No
205
         *            "Roedy Green" <roedyg@mindprod.com> style addresses. No local
206
         *            addresses, e.g. roedy.
207
         *
208
         * @return <ul>
209
         *         <li>0 = email address is definitely malformed, e.g. missing
210
         * @. ends in .invalid</li> <li>1 = address does not meet one of the valid
211
         *    patterns below. It still might be ok according to some obscure rule in
212
         *    RFC 822 Java InternetAddress accepts it as valid.</li> <li>2 = unknown
213
         *    top level domain.</li> <li>3 = dots at beginning or end, doubled in
214
         *    name.</li> <li>4 = address of form xxx@[209.139.205.2] using IP</li>
215
         *    <li>5 = address of form xxx.xxx.xxx@xxx.xxx.xxx Dots _ or - in first
216
         *    part of name</li> <li>6 = addreess of form xxx@xxx.xxx.xxx rare, but
217
         *    known, domain</li> <li>7 = address of form xxx@xxx.xxx.ca or any
218
         *    national suffix.</li> <li>8 = address of form xxx@xxx.xxx.xx the
219
         *    matching this national suffix, e.g. .ca in Canada, .de in Germany</li>
220
         *    <li>9 = address of form xxx@xxx.xxx.com .org .net .edu .gov .biz --
221
         *    official domains</li>
222
         *    </ul>
223
         */
224
        public static int howValid(String email) {
225
                if (email == null) {
226
                        return 0;
227
                }
228
                email = email.trim().toLowerCase();
229
                int dotPlace = email.lastIndexOf('.');
230
                if (0 < dotPlace && dotPlace < email.length() - 1) {
231
                        String tld = email.substring(dotPlace + 1);
232
                        if (BAD_TLDS.contains(tld)) {
233
                                /* deliberate invalid address */
234
                                return 0;
235
                        }
236
                        // make sure none of fragments start or end in _ or -
237
                        String[] fragments = splitter.split(email);
238
                        boolean clean = true;
239
                        for (String fragment : fragments) {
240
                                if (fragment.startsWith("_") || fragment.endsWith("_")
241
                                                || fragment.startsWith("-") || fragment.endsWith("-")) {
242
                                        clean = false;
243
                                        break;
244
                                }
245
                        }// end for
246
                        if (clean) {
247
                                Matcher m9 = p9.matcher(email);
248
                                if (m9.matches()) {
249
                                        if (OFFICIAL_TLDS.contains(tld)) {
250
                                                return 9;
251
                                        } else if (THIS_COUNTRY.equals(tld)) {
252
                                                return 8;
253
                                        } else if (NATIONAL_TLDS.contains(tld)) {
254
                                                return 7;
255
                                        } else if (RARE_TLDS.contains(tld)) {
256
                                                return 6;
257
                                        } else {
258
                                                // TODO: Why is that 3 and not 2?
259
                                                return 3;/* unknown tld */
260
                                        }
261
                                }
262
                                // allow dots in name
263
                                Matcher m5 = p5.matcher(email);
264
                                if (m5.matches()) {
265
                                        if (OFFICIAL_TLDS.contains(tld)) {
266
                                                return 5;
267
                                        } else if (THIS_COUNTRY.equals(tld)) {
268
                                                return 5;
269
                                        } else if (NATIONAL_TLDS.contains(tld)) {
270
                                                return 5;
271
                                        } else if (RARE_TLDS.contains(tld)) {
272
                                                return 5;
273
                                        } else {
274
                                                return 2;/* unknown tld */
275
                                        }
276
                                }
277
 
278
                                // IP
279
                                Matcher m4 = p4.matcher(email);
280
                                if (m4.matches()) {
281
                                        return 4;/* can't tell TLD */
282
                                }
283
 
284
                                // allow even lead/trail dots in name, except at start of domain
285
                                Matcher m3 = p3.matcher(email);
286
                                if (m3.matches()) {
287
                                        if (OFFICIAL_TLDS.contains(tld)) {
288
                                                return 3;
289
                                        } else if (THIS_COUNTRY.equals(tld)) {
290
                                                return 3;
291
                                        } else if (NATIONAL_TLDS.contains(tld)) {
292
                                                return 3;
293
                                        } else if (RARE_TLDS.contains(tld)) {
294
                                                return 3;
295
                                        } else {
296
                                                return 2;/* unknown domain */
297
                                        }
298
                                }
299
                        }// end if clean
300
                }
301
                // allow even unclean addresses, and addresses without a TLD to have a
302
                // whack at passing RFC:822
303
                try {
304
                        /*
305
                         * see if InternetAddress likes it, it follows RFC:822. It will
306
                         * names without domains though.
307
                         */
308
                        InternetAddress.parse(email, true/* strict */);
309
                        // it liked it, no exception happened. Seems very sloppy.
310
                        return 1;
311
                } catch (AddressException e) {
312
                        // it did not like it
313
                        return 0;
314
                }
315
        }
316
 
317
        // -------------------------- STATIC METHODS --------------------------
318
 
319
        /**
320
         * build a HashSet from a array of String literals.
321
         *
322
         * @param list
323
         *            array of strings
324
         *
325
         * @return HashSet you can use to test if a string is in the set.
326
         */
327
        private static HashSet<String> hmaker(String[] list) {
328
                HashSet<String> map = new HashSet<String>(Math.max(
329
                                (int) (list.length / .75f) + 1, 16));
330
                map.addAll(Arrays.asList(list));
331
                return map;
332
        }
333
 
334
        // --------------------------- main() method ---------------------------
335
 
336
        /**
337
         * main debugging harness.
338
         *
339
         * @param args
340
         *            not used
341
         */
342
        // public static void main(String[] args) {
343
        // out.println(howValid("kellizer@.hotmail.com"));
344
        // }
345
}