Subversion Repositories javautils

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4 daniel-mar 1
/*
2
 * @(#)EmailSyntaxValidator.java
3
 *
4
 * Summary: Validate syntax of email addresses.
5
 *
6
 * Copyright: (c) 2002-2010 Roedy Green, Canadian Mind Products, http://mindprod.com
7
 *
8
 * Licence: This software may be copied and used freely for any purpose but military.
9
 *          http://mindprod.com/contact/nonmil.html
10
 *
11
 * Requires: JDK 1.5+
12
 *
13
 * Created with: IntelliJ IDEA IDE.
14
 *
15
 * Version History:
16
 *  1.7 2007-08-21
17
 */
18
package com.mindprod.bulk;
19
 
20
// Download newest version here:
21
// http://mindprod.com/products1.html#BULK
22
// SVN:
23
// http://wush.net/svn/mindprod/com/mindprod/bulk/EmailSyntaxValidator.java
24
 
25
// TODO: E-mail address preprocessor (normalizer): Punycode conversion, trimming
26
// TODO: Bad TLDs + pseudo-TLDs (Tor: .exit, etc.)
27
// TODO: Awaiting bulk commit
28
 
29
// CHANGELOG BY DANIEL MARSCHALL
30
//
31
//Added ccTLDs
32
//
33
//.ax = Aland Islands
34
//.eu = European Union
35
//.me = Montenegro
36
//.rs = Serbia
37
//.su = Soviet Union (being phased out)
38
//.tl = Timor-Leste
39
//
40
//Deleted ccTLDs
41
//
42
//.bv = Bouvet Island [Allocated/unused]
43
//.eh = Western Sahara [Reserved/unassigned]
44
//.fx = UNKNOWN
45
//.gb = United Kingdom [Allocated/unused]
46
//.pm = Saint Pierre and Miquelon [Allocated/unused]
47
//.sj = Svalbard and Jan Mayen [Allocated/unused]
48
//.so = Somalia [Allocated/unused]
49
//.um = United States Minor Outlying Islands [Reserved/unassigned]
50
//.yt = Mayotte [Allocated/unused]
51
//.yu = Yugoslavia [Deleted/retired]
52
//
53
//Added BAD TLDs
54
//
55
//.example (RFC 2606)
56
//.localhost (RFC 2606)
57
//.test (RFC 2606)
58
//
59
//Added official TLDs
60
//
61
//.arpa (infrastructure TLD)
62
//.tel (sponsored TLD)               -- official TLD or rare TLD?
63
//.mobi (sponsored TLD)              -- official TLD or rare TLD?
64
//.jobs (sponsored TLD)              -- official TLD or rare TLD?
65
//.cat (sponsored TLD)               -- official TLD or rare TLD?
66
//
67
//Other changes
68
//
69
//* Commented out unused debugging stuff
70
//* Removed main procedure and syso import
71
 
72
import javax.mail.internet.AddressException;
73
import javax.mail.internet.InternetAddress;
74
import java.util.Arrays;
75
import java.util.HashSet;
76
import java.util.Locale;
77
import java.util.regex.Matcher;
78
import java.util.regex.Pattern;
79
 
80
/**
81
 * Validate syntax of email addresses.
82
 * <p/>
83
 * Does not probe to see if mailserver exists in DNS or online. See MailProber
84
 * for that. See ValidateEmailFile for an example of how to use this class.
85
 *
86
 * @author Roedy Green, Canadian Mind Products
87
 * @version 1.7 2007-08-21
88
 * @since 2002
89
 */
90
// TODO: @version check validity of & in first part of email address. Appears in
91
// practice.
92
 
93
public final class EmailSyntaxValidator {
94
        // ------------------------------ CONSTANTS ------------------------------
95
 
96
        /**
97
         * True if want extra debugging output.
98
         */
99
        // @SuppressWarnings( { "UnusedDeclaration" })
100
        // private static final boolean DEBUGGING = false;
101
 
102
        /**
103
         * Country where this program is running.
104
         */
105
        private static final String THIS_COUNTRY = Locale.getDefault().getCountry()
106
                        .toLowerCase();
107
 
108
        /**
109
         * Bad top level domains -- ones never valid.
110
         */
111
        private static final HashSet<String> BAD_TLDS = hmaker(new String[] {
112
                        "invalid", "nowhere", "noone", "test", "example", "localhost", });
113
 
114
        /**
115
         * Top level domains for countries.
116
         */
117
        private static final HashSet<String> NATIONAL_TLDS = hmaker(new String[] {
118
                        "ac", "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq",
119
                        "ar", "as", "at", "au", "aw", "ax", "az", "ba", "bb", "bd", "be",
120
                        "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt",
121
                        "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck",
122
                        "cl", "cm", "cn", "co", "cr", "cu", "cv", "cx", "cy", "cz", "de",
123
                        "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "er", "es", "et",
124
                        "eu", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gd", "ge",
125
                        "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs",
126
                        "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id",
127
                        "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je", "jm",
128
                        "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw",
129
                        "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu",
130
                        "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml", "mm",
131
                        "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx",
132
                        "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np",
133
                        "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl",
134
                        "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", "ru",
135
                        "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sk", "sl",
136
                        "sm", "sn", "sr", "st", "su", "sv", "sy", "sz", "tc", "td", "tf",
137
                        "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "tt",
138
                        "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc",
139
                        "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "za", "zm", "zw", });
140
 
141
        /**
142
         * Official top level domains.
143
         */
144
        private static final HashSet<String> OFFICIAL_TLDS = hmaker(new String[] {
145
                        "aero", "biz", "coop", "com", "edu", "gov", "info", "mil",
146
                        "museum", "name", "net", "org", "pro", "tel", "mobi", "jobs",
147
                        "cat", "arpa", });
148
 
149
        /**
150
         * Rarely used top level domains
151
         */
152
        private static final HashSet<String> RARE_TLDS = hmaker(new String[] {
153
                        "cam", "mp3", "agent", "art", "arts", "asia", "auction", "aus",
154
                        "bank", "cam", "chat", "church", "club", "corp", "dds", "design",
155
                        "dns2go", "e", "email", "exp", "fam", "family", "faq", "fed",
156
                        "film", "firm", "free", "fun", "g", "game", "games", "gay", "ger",
157
                        "globe", "gmbh", "golf", "gov", "help", "hola", "i", "inc", "int",
158
                        "jpn", "k12", "kids", "law", "learn", "llb", "llc", "llp", "lnx",
159
                        "love", "ltd", "mag", "mail", "med", "media", "mp3", "netz", "nic",
160
                        "nom", "npo", "per", "pol", "prices", "radio", "rsc", "school",
161
                        "scifi", "sea", "service", "sex", "shop", "sky", "soc", "space",
162
                        "sport", "tech", "tour", "travel", "usvi", "video", "web", "wine",
163
                        "wir", "wired", "zine", "zoo", });
164
 
165
        /**
166
         * regex to allow dots anywhere, but not at start of domain name, no +
167
         */
168
        private static final Pattern p3 = Pattern
169
                        .compile("[a-z0-9\\-_\\.]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
170
 
171
        /**
172
         * regex IP style names, no +
173
         */
174
        private static final Pattern p4 = Pattern
175
                        .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]");
176
 
177
        /**
178
         * regex to allow - _ dots in name, no +
179
         */
180
        private static final Pattern p5 = Pattern
181
                        .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
182
 
183
        /**
184
         * regex to allow _ - in name, lead and trailing ones are filtered later, no
185
         * +.
186
         */
187
        private static final Pattern p9 = Pattern
188
                        .compile("[a-z0-9\\-_]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
189
 
190
        /**
191
         * regex to split into fields
192
         */
193
        private static final Pattern splitter = Pattern.compile("[@\\.]");
194
 
195
        // -------------------------- PUBLIC STATIC METHODS
196
        // --------------------------
197
 
198
        /**
199
         * Check how likely an email address is to be valid. The higher the number
200
         * returned, the more likely the address is valid. This method does not
201
         * probe the internet in any way to see if the corresponding mail server or
202
         * domain exists.
203
         *
204
         * @param email
205
         *            bare computer email address. e.g. roedyg@mindprod.com No
206
         *            "Roedy Green" <roedyg@mindprod.com> style addresses. No local
207
         *            addresses, e.g. roedy.
208
         *
209
         * @return <ul>
210
         *         <li>0 = email address is definitely malformed, e.g. missing
211
         * @. ends in .invalid</li> <li>1 = address does not meet one of the valid
212
         *    patterns below. It still might be ok according to some obscure rule in
213
         *    RFC 822 Java InternetAddress accepts it as valid.</li> <li>2 = unknown
214
         *    top level domain.</li> <li>3 = dots at beginning or end, doubled in
215
         *    name.</li> <li>4 = address of form xxx@[209.139.205.2] using IP</li>
216
         *    <li>5 = address of form xxx.xxx.xxx@xxx.xxx.xxx Dots _ or - in first
217
         *    part of name</li> <li>6 = addreess of form xxx@xxx.xxx.xxx rare, but
218
         *    known, domain</li> <li>7 = address of form xxx@xxx.xxx.ca or any
219
         *    national suffix.</li> <li>8 = address of form xxx@xxx.xxx.xx the
220
         *    matching this national suffix, e.g. .ca in Canada, .de in Germany</li>
221
         *    <li>9 = address of form xxx@xxx.xxx.com .org .net .edu .gov .biz --
222
         *    official domains</li>
223
         *    </ul>
224
         */
225
        public static int howValid(String email) {
226
                if (email == null) {
227
                        return 0;
228
                }
229
                email = email.trim().toLowerCase();
230
                int dotPlace = email.lastIndexOf('.');
231
                if (0 < dotPlace && dotPlace < email.length() - 1) {
232
                        String tld = email.substring(dotPlace + 1);
233
                        if (BAD_TLDS.contains(tld)) {
234
                                /* deliberate invalid address */
235
                                return 0;
236
                        }
237
                        // make sure none of fragments start or end in _ or -
238
                        String[] fragments = splitter.split(email);
239
                        boolean clean = true;
240
                        for (String fragment : fragments) {
241
                                if (fragment.startsWith("_") || fragment.endsWith("_")
242
                                                || fragment.startsWith("-") || fragment.endsWith("-")) {
243
                                        clean = false;
244
                                        break;
245
                                }
246
                        }// end for
247
                        if (clean) {
248
                                Matcher m9 = p9.matcher(email);
249
                                if (m9.matches()) {
250
                                        if (OFFICIAL_TLDS.contains(tld)) {
251
                                                return 9;
252
                                        } else if (THIS_COUNTRY.equals(tld)) {
253
                                                return 8;
254
                                        } else if (NATIONAL_TLDS.contains(tld)) {
255
                                                return 7;
256
                                        } else if (RARE_TLDS.contains(tld)) {
257
                                                return 6;
258
                                        } else {
259
                                                // TODO: Why is that 3 and not 2?
260
                                                return 3;/* unknown tld */
261
                                        }
262
                                }
263
                                // allow dots in name
264
                                Matcher m5 = p5.matcher(email);
265
                                if (m5.matches()) {
266
                                        if (OFFICIAL_TLDS.contains(tld)) {
267
                                                return 5;
268
                                        } else if (THIS_COUNTRY.equals(tld)) {
269
                                                return 5;
270
                                        } else if (NATIONAL_TLDS.contains(tld)) {
271
                                                return 5;
272
                                        } else if (RARE_TLDS.contains(tld)) {
273
                                                return 5;
274
                                        } else {
275
                                                return 2;/* unknown tld */
276
                                        }
277
                                }
278
 
279
                                // IP
280
                                Matcher m4 = p4.matcher(email);
281
                                if (m4.matches()) {
282
                                        return 4;/* can't tell TLD */
283
                                }
284
 
285
                                // allow even lead/trail dots in name, except at start of domain
286
                                Matcher m3 = p3.matcher(email);
287
                                if (m3.matches()) {
288
                                        if (OFFICIAL_TLDS.contains(tld)) {
289
                                                return 3;
290
                                        } else if (THIS_COUNTRY.equals(tld)) {
291
                                                return 3;
292
                                        } else if (NATIONAL_TLDS.contains(tld)) {
293
                                                return 3;
294
                                        } else if (RARE_TLDS.contains(tld)) {
295
                                                return 3;
296
                                        } else {
297
                                                return 2;/* unknown domain */
298
                                        }
299
                                }
300
                        }// end if clean
301
                }
302
                // allow even unclean addresses, and addresses without a TLD to have a
303
                // whack at passing RFC:822
304
                try {
305
                        /*
306
                         * see if InternetAddress likes it, it follows RFC:822. It will
307
                         * names without domains though.
308
                         */
309
                        InternetAddress.parse(email, true/* strict */);
310
                        // it liked it, no exception happened. Seems very sloppy.
311
                        return 1;
312
                } catch (AddressException e) {
313
                        // it did not like it
314
                        return 0;
315
                }
316
        }
317
 
318
        // -------------------------- STATIC METHODS --------------------------
319
 
320
        /**
321
         * build a HashSet from a array of String literals.
322
         *
323
         * @param list
324
         *            array of strings
325
         *
326
         * @return HashSet you can use to test if a string is in the set.
327
         */
328
        private static HashSet<String> hmaker(String[] list) {
329
                HashSet<String> map = new HashSet<String>(Math.max(
330
                                (int) (list.length / .75f) + 1, 16));
331
                map.addAll(Arrays.asList(list));
332
                return map;
333
        }
334
 
335
        // --------------------------- main() method ---------------------------
336
 
337
        /**
338
         * main debugging harness.
339
         *
340
         * @param args
341
         *            not used
342
         */
343
        // public static void main(String[] args) {
344
        // out.println(howValid("kellizer@.hotmail.com"));
345
        // }
346
}