Subversion Repositories javautils

Rev

Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

  1. /*
  2.  * @(#)EmailSyntaxValidator.java
  3.  *
  4.  * Summary: Validate syntax of email addresses.
  5.  *
  6.  * Copyright: (c) 2002-2010 Roedy Green, Canadian Mind Products, http://mindprod.com
  7.  *
  8.  * Licence: This software may be copied and used freely for any purpose but military.
  9.  *          http://mindprod.com/contact/nonmil.html
  10.  *
  11.  * Requires: JDK 1.5+
  12.  *
  13.  * Created with: IntelliJ IDEA IDE.
  14.  *
  15.  * Version History:
  16.  *  1.7 2007-08-21
  17.  */
  18. package com.mindprod.bulk;
  19.  
  20. // Download newest version here:
  21. // http://mindprod.com/products1.html#BULK
  22. // SVN:
  23. // http://wush.net/svn/mindprod/com/mindprod/bulk/EmailSyntaxValidator.java
  24.  
  25. // TODO: E-mail preprocessor/normalizer (Punycode conversion, trimming)
  26. // TODO: BAD TLDS + PSEUDO (TOR: EXIT ETC)
  27. // TODO: Awaiting bulk commit
  28.  
  29. // CHANGELOG BY DANIEL MARSCHALL
  30. //
  31. //Added ccTLDs
  32. //
  33. //.ax = Aland Islands
  34. //.eu = European Union
  35. //.me = Montenegro
  36. //.rs = Serbia
  37. //.su = Soviet Union (being phased out)
  38. //.tl = Timor-Leste
  39. //
  40. //Deleted ccTLDs
  41. //
  42. //.bv = Bouvet Island [Allocated/unused]
  43. //.eh = Western Sahara [Reserved/unassigned]
  44. //.fx = UNKNOWN
  45. //.gb = United Kingdom [Allocated/unused]
  46. //.pm = Saint Pierre and Miquelon [Allocated/unused]
  47. //.sj = Svalbard and Jan Mayen [Allocated/unused]
  48. //.so = Somalia [Allocated/unused]
  49. //.um = United States Minor Outlying Islands [Reserved/unassigned]
  50. //.yt = Mayotte [Allocated/unused]
  51. //.yu = Yugoslavia [Deleted/retired]
  52. //
  53. //Added BAD TLDs
  54. //
  55. //.example (RFC 2606)
  56. //.localhost (RFC 2606)
  57. //.test (RFC 2606)
  58. //
  59. //Added official TLDs
  60. //
  61. //.arpa (infrastructure TLD)
  62. //.tel (sponsored TLD)               -- official TLD or rare TLD?
  63. //.mobi (sponsored TLD)              -- official TLD or rare TLD?
  64. //.jobs (sponsored TLD)              -- official TLD or rare TLD?
  65. //.cat (sponsored TLD)               -- official TLD or rare TLD?
  66. //
  67. //Other changes
  68. //
  69. //* Commented out unused debugging stuff
  70. //* Removed main procedure and syso import
  71.  
  72. import javax.mail.internet.AddressException;
  73. import javax.mail.internet.InternetAddress;
  74. import java.util.Arrays;
  75. import java.util.HashSet;
  76. import java.util.Locale;
  77. import java.util.regex.Matcher;
  78. import java.util.regex.Pattern;
  79.  
  80. /**
  81.  * Validate syntax of email addresses.
  82.  * <p/>
  83.  * Does not probe to see if mailserver exists in DNS or online. See MailProber
  84.  * for that. See ValidateEmailFile for an example of how to use this class.
  85.  *
  86.  * @author Roedy Green, Canadian Mind Products
  87.  * @version 1.7 2007-08-21
  88.  * @since 2002
  89.  */
  90. // TODO: @version check validity of & in first part of email address. Appears in
  91. // practice.
  92.  
  93. public final class EmailSyntaxValidator {
  94.         // ------------------------------ CONSTANTS ------------------------------
  95.  
  96.         /**
  97.          * True if want extra debugging output.
  98.          */
  99.         // @SuppressWarnings( { "UnusedDeclaration" })
  100.         // private static final boolean DEBUGGING = false;
  101.  
  102.         /**
  103.          * Country where this program is running.
  104.          */
  105.         private static final String THIS_COUNTRY = Locale.getDefault().getCountry()
  106.                         .toLowerCase();
  107.  
  108.         /**
  109.          * Bad top level domains -- ones never valid.
  110.          */
  111.         private static final HashSet<String> BAD_TLDS = hmaker(new String[] {
  112.                         "invalid", "nowhere", "noone", "test", "example", "localhost", });
  113.  
  114.         /**
  115.          * Top level domains for countries.
  116.          */
  117.         private static final HashSet<String> NATIONAL_TLDS = hmaker(new String[] {
  118.                         "ac", "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq",
  119.                         "ar", "as", "at", "au", "aw", "ax", "az", "ba", "bb", "bd", "be",
  120.                         "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt",
  121.                         "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck",
  122.                         "cl", "cm", "cn", "co", "cr", "cu", "cv", "cx", "cy", "cz", "de",
  123.                         "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "er", "es", "et",
  124.                         "eu", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gd", "ge",
  125.                         "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs",
  126.                         "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id",
  127.                         "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je", "jm",
  128.                         "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw",
  129.                         "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu",
  130.                         "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml", "mm",
  131.                         "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx",
  132.                         "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np",
  133.                         "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl",
  134.                         "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", "ru",
  135.                         "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sk", "sl",
  136.                         "sm", "sn", "sr", "st", "su", "sv", "sy", "sz", "tc", "td", "tf",
  137.                         "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "tt",
  138.                         "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc",
  139.                         "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "za", "zm", "zw", });
  140.  
  141.         /**
  142.          * Official top level domains.
  143.          */
  144.         private static final HashSet<String> OFFICIAL_TLDS = hmaker(new String[] {
  145.                         "aero", "biz", "coop", "com", "edu", "gov", "info", "mil",
  146.                         "museum", "name", "net", "org", "pro", "tel", "mobi", "jobs",
  147.                         "cat", "arpa", });
  148.  
  149.         /**
  150.          * Rarely used top level domains
  151.          */
  152.         private static final HashSet<String> RARE_TLDS = hmaker(new String[] {
  153.                         "cam", "mp3", "agent", "art", "arts", "asia", "auction", "aus",
  154.                         "bank", "cam", "chat", "church", "club", "corp", "dds", "design",
  155.                         "dns2go", "e", "email", "exp", "fam", "family", "faq", "fed",
  156.                         "film", "firm", "free", "fun", "g", "game", "games", "gay", "ger",
  157.                         "globe", "gmbh", "golf", "gov", "help", "hola", "i", "inc", "int",
  158.                         "jpn", "k12", "kids", "law", "learn", "llb", "llc", "llp", "lnx",
  159.                         "love", "ltd", "mag", "mail", "med", "media", "mp3", "netz", "nic",
  160.                         "nom", "npo", "per", "pol", "prices", "radio", "rsc", "school",
  161.                         "scifi", "sea", "service", "sex", "shop", "sky", "soc", "space",
  162.                         "sport", "tech", "tour", "travel", "usvi", "video", "web", "wine",
  163.                         "wir", "wired", "zine", "zoo", });
  164.  
  165.         /**
  166.          * regex to allow dots anywhere, but not at start of domain name, no +
  167.          */
  168.         private static final Pattern p3 = Pattern
  169.                         .compile("[a-z0-9\\-_\\.]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
  170.  
  171.         /**
  172.          * regex IP style names, no +
  173.          */
  174.         private static final Pattern p4 = Pattern
  175.                         .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]");
  176.  
  177.         /**
  178.          * regex to allow - _ dots in name, no +
  179.          */
  180.         private static final Pattern p5 = Pattern
  181.                         .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
  182.  
  183.         /**
  184.          * regex to allow _ - in name, lead and trailing ones are filtered later, no
  185.          * +.
  186.          */
  187.         private static final Pattern p9 = Pattern
  188.                         .compile("[a-z0-9\\-_]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
  189.  
  190.         /**
  191.          * regex to split into fields
  192.          */
  193.         private static final Pattern splitter = Pattern.compile("[@\\.]");
  194.  
  195.         // -------------------------- PUBLIC STATIC METHODS
  196.         // --------------------------
  197.  
  198.         /**
  199.          * Check how likely an email address is to be valid. The higher the number
  200.          * returned, the more likely the address is valid. This method does not
  201.          * probe the internet in any way to see if the corresponding mail server or
  202.          * domain exists.
  203.          *
  204.          * @param email
  205.          *            bare computer email address. e.g. roedyg@mindprod.com No
  206.          *            "Roedy Green" <roedyg@mindprod.com> style addresses. No local
  207.          *            addresses, e.g. roedy.
  208.          *
  209.          * @return <ul>
  210.          *         <li>0 = email address is definitely malformed, e.g. missing
  211.          * @. ends in .invalid</li> <li>1 = address does not meet one of the valid
  212.          *    patterns below. It still might be ok according to some obscure rule in
  213.          *    RFC 822 Java InternetAddress accepts it as valid.</li> <li>2 = unknown
  214.          *    top level domain.</li> <li>3 = dots at beginning or end, doubled in
  215.          *    name.</li> <li>4 = address of form xxx@[209.139.205.2] using IP</li>
  216.          *    <li>5 = address of form xxx.xxx.xxx@xxx.xxx.xxx Dots _ or - in first
  217.          *    part of name</li> <li>6 = addreess of form xxx@xxx.xxx.xxx rare, but
  218.          *    known, domain</li> <li>7 = address of form xxx@xxx.xxx.ca or any
  219.          *    national suffix.</li> <li>8 = address of form xxx@xxx.xxx.xx the
  220.          *    matching this national suffix, e.g. .ca in Canada, .de in Germany</li>
  221.          *    <li>9 = address of form xxx@xxx.xxx.com .org .net .edu .gov .biz --
  222.          *    official domains</li>
  223.          *    </ul>
  224.          */
  225.         public static int howValid(String email) {
  226.                 if (email == null) {
  227.                         return 0;
  228.                 }
  229.                 email = email.trim().toLowerCase();
  230.                 int dotPlace = email.lastIndexOf('.');
  231.                 if (0 < dotPlace && dotPlace < email.length() - 1) {
  232.                         String tld = email.substring(dotPlace + 1);
  233.                         if (BAD_TLDS.contains(tld)) {
  234.                                 /* deliberate invalid address */
  235.                                 return 0;
  236.                         }
  237.                         // make sure none of fragments start or end in _ or -
  238.                         String[] fragments = splitter.split(email);
  239.                         boolean clean = true;
  240.                         for (String fragment : fragments) {
  241.                                 if (fragment.startsWith("_") || fragment.endsWith("_")
  242.                                                 || fragment.startsWith("-") || fragment.endsWith("-")) {
  243.                                         clean = false;
  244.                                         break;
  245.                                 }
  246.                         }// end for
  247.                         if (clean) {
  248.                                 Matcher m9 = p9.matcher(email);
  249.                                 if (m9.matches()) {
  250.                                         if (OFFICIAL_TLDS.contains(tld)) {
  251.                                                 return 9;
  252.                                         } else if (THIS_COUNTRY.equals(tld)) {
  253.                                                 return 8;
  254.                                         } else if (NATIONAL_TLDS.contains(tld)) {
  255.                                                 return 7;
  256.                                         } else if (RARE_TLDS.contains(tld)) {
  257.                                                 return 6;
  258.                                         } else {
  259.                                                 // TODO: Why is that 3 and not 2?
  260.                                                 return 3;/* unknown tld */
  261.                                         }
  262.                                 }
  263.                                 // allow dots in name
  264.                                 Matcher m5 = p5.matcher(email);
  265.                                 if (m5.matches()) {
  266.                                         if (OFFICIAL_TLDS.contains(tld)) {
  267.                                                 return 5;
  268.                                         } else if (THIS_COUNTRY.equals(tld)) {
  269.                                                 return 5;
  270.                                         } else if (NATIONAL_TLDS.contains(tld)) {
  271.                                                 return 5;
  272.                                         } else if (RARE_TLDS.contains(tld)) {
  273.                                                 return 5;
  274.                                         } else {
  275.                                                 return 2;/* unknown tld */
  276.                                         }
  277.                                 }
  278.  
  279.                                 // IP
  280.                                 Matcher m4 = p4.matcher(email);
  281.                                 if (m4.matches()) {
  282.                                         return 4;/* can't tell TLD */
  283.                                 }
  284.  
  285.                                 // allow even lead/trail dots in name, except at start of domain
  286.                                 Matcher m3 = p3.matcher(email);
  287.                                 if (m3.matches()) {
  288.                                         if (OFFICIAL_TLDS.contains(tld)) {
  289.                                                 return 3;
  290.                                         } else if (THIS_COUNTRY.equals(tld)) {
  291.                                                 return 3;
  292.                                         } else if (NATIONAL_TLDS.contains(tld)) {
  293.                                                 return 3;
  294.                                         } else if (RARE_TLDS.contains(tld)) {
  295.                                                 return 3;
  296.                                         } else {
  297.                                                 return 2;/* unknown domain */
  298.                                         }
  299.                                 }
  300.                         }// end if clean
  301.                 }
  302.                 // allow even unclean addresses, and addresses without a TLD to have a
  303.                 // whack at passing RFC:822
  304.                 try {
  305.                         /*
  306.                          * see if InternetAddress likes it, it follows RFC:822. It will
  307.                          * names without domains though.
  308.                          */
  309.                         InternetAddress.parse(email, true/* strict */);
  310.                         // it liked it, no exception happened. Seems very sloppy.
  311.                         return 1;
  312.                 } catch (AddressException e) {
  313.                         // it did not like it
  314.                         return 0;
  315.                 }
  316.         }
  317.  
  318.         // -------------------------- STATIC METHODS --------------------------
  319.  
  320.         /**
  321.          * build a HashSet from a array of String literals.
  322.          *
  323.          * @param list
  324.          *            array of strings
  325.          *
  326.          * @return HashSet you can use to test if a string is in the set.
  327.          */
  328.         private static HashSet<String> hmaker(String[] list) {
  329.                 HashSet<String> map = new HashSet<String>(Math.max(
  330.                                 (int) (list.length / .75f) + 1, 16));
  331.                 map.addAll(Arrays.asList(list));
  332.                 return map;
  333.         }
  334.  
  335.         // --------------------------- main() method ---------------------------
  336.  
  337.         /**
  338.          * main debugging harness.
  339.          *
  340.          * @param args
  341.          *            not used
  342.          */
  343.         // public static void main(String[] args) {
  344.         // out.println(howValid("kellizer@.hotmail.com"));
  345.         // }
  346. }
  347.