Skip to content

Commit ab06a8b

Browse files
committed
fixed: is_email is now compliant with email specifications
1 parent ad3c497 commit ab06a8b

File tree

3 files changed

+146
-13
lines changed

3 files changed

+146
-13
lines changed

string_utils/_regex.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222

2323
URLS_RE = re.compile(r'({})'.format(URLS_RAW_STRING), re.IGNORECASE)
2424

25-
EMAILS_RAW_STRING = r'[a-zA-Z\d._+-]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}'
25+
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
26+
27+
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
2628

2729
EMAIL_RE = re.compile(r'^{}$'.format(EMAILS_RAW_STRING))
2830

string_utils/validation.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -200,15 +200,22 @@ def is_url(input_string: Any, allowed_schemes: Optional[List[str]] = None) -> bo
200200
return valid
201201

202202

203+
# todo: fix me
204+
'''
205+
That limit is a maximum of 64 characters (octets)
206+
in the "local part" (before the "@") and a maximum of 255 characters
207+
(octets) in the domain part (after the "@") for a total length of 320
208+
characters. Systems that handle email should be prepared to process
209+
addresses which are that long, even though they are rarely
210+
encountered.
211+
'''
212+
213+
203214
def is_email(input_string: Any) -> bool:
204215
"""
205-
Check if a string is an email.
216+
Check if a string is a valid email.
206217
207-
By design, the implementation of this checking does not strictly follow the specification for a valid \
208-
email address, but instead it's based on real world cases in order to match more than 99% \
209-
of emails and catch user mistakes. For example the percentage sign "%" is a valid sign for an email, \
210-
but actually no one use it, instead if such sign is found in a string coming from user input (like a \
211-
web form) it's very likely that it's a mistake.
218+
Reference: https://tools.ietf.org/html/rfc3696#section-3
212219
213220
*Examples:*
214221
@@ -219,7 +226,36 @@ def is_email(input_string: Any) -> bool:
219226
:type input_string: str
220227
:return: True if email, false otherwise.
221228
"""
222-
return is_full_string(input_string) and EMAIL_RE.match(input_string) is not None
229+
# first simple "pre check": it must be a non empty string with max len 320 and cannot start with a dot
230+
if not is_full_string(input_string) or len(input_string) > 320 or input_string.startswith('.'):
231+
return False
232+
233+
try:
234+
# we expect 2 tokens, one before "@" and one after, otherwise we have an exception and the email is not valid
235+
head, tail = input_string.split('@')
236+
237+
# removes escaped spaces, so that later on the test regex will accept the string
238+
head = head.replace('\\ ', '')
239+
if head.startswith('"') and head.endswith('"'):
240+
head = head.replace(' ', '')[1:-1]
241+
242+
if head.endswith('.') or len(head) > 64 or len(tail) > 255:
243+
return False
244+
245+
# multiple consecutive dots are forbidden
246+
if '..' in head:
247+
return False
248+
249+
return EMAIL_RE.match(head + '@' + tail) is not None
250+
251+
except ValueError:
252+
# borderline case in which we have multiple "@" signs but the head part is correctly escaped
253+
if ESCAPED_AT_SIGN.search(input_string) is not None:
254+
# replace "@" with "a" in the head
255+
sanitized = ESCAPED_AT_SIGN.sub('a', input_string)
256+
return is_email(sanitized)
257+
258+
return False
223259

224260

225261
def is_credit_card(input_string: Any, card_type: str = None) -> bool:

tests/test_is_email.py

Lines changed: 100 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,9 @@ def test_domain_extension_should_be_letters_only_from_2_to_4_chars(self):
4444
self.assertFalse(is_email('me@foo.___'))
4545
self.assertFalse(is_email('me@foo.toolongext'))
4646

47-
def test_name_part_cannot_contain_bad_signs(self):
48-
self.assertFalse(is_email('#me#@foo.com'))
49-
self.assertFalse(is_email('me!@foo.com'))
50-
self.assertFalse(is_email('[][]@foo.com'))
51-
self.assertFalse(is_email('john%@john5music.net'))
47+
def test_name_part_cannot_contain_suqare_brackets(self):
48+
self.assertFalse(is_email('[myemail@foo.com'))
49+
self.assertFalse(is_email('my]email@foo.com'))
5250

5351
def test_domain_part_cannot_contain_bad_signs(self):
5452
self.assertFalse(is_email('me@#foo#.com'))
@@ -74,3 +72,100 @@ def test_should_accept_valid_emails(self):
7472
self.assertTrue(is_email('foo@domamin.subdomain.com'))
7573
self.assertTrue(is_email('is1email@domain.org'))
7674
self.assertTrue(is_email('UPPER_CASE_EMAIL@somesite.com'))
75+
76+
def test_max_email_length_is_respected(self):
77+
invalid_email = ('a' * 320) + '@gmail.com'
78+
self.assertFalse(is_email(invalid_email))
79+
80+
def test_local_part_length_is_respected(self):
81+
# max local part is 64 (before "@")
82+
invalid_email = ('a' * 65) + '@gmail.com'
83+
self.assertFalse(is_email(invalid_email))
84+
85+
def test_octects_part_length_is_respected(self):
86+
# max octets part is 255 (after "@")
87+
invalid_email = 'a@{}.com'.format(255 * 'x')
88+
self.assertFalse(is_email(invalid_email))
89+
90+
def test_plus_is_valid_char_in_local_part(self):
91+
self.assertTrue(is_email("my+mail@gmail.com"))
92+
93+
def test_minus_is_valid_char_in_local_part(self):
94+
self.assertTrue(is_email("my-mail@gmail.com"))
95+
96+
def test_slash_is_valid_char_in_local_part(self):
97+
self.assertTrue(is_email("my/mail@gmail.com"))
98+
99+
def test_back_slash_is_valid_char_in_local_part(self):
100+
self.assertTrue(is_email("my\\mail@gmail.com"))
101+
102+
def test_equal_is_valid_char_in_local_part(self):
103+
self.assertTrue(is_email("my=mail@gmail.com"))
104+
105+
def test_question_mark_is_valid_char_in_local_part(self):
106+
self.assertTrue(is_email("my?mail@gmail.com"))
107+
108+
def test_sharp_is_valid_char_in_local_part(self):
109+
self.assertTrue(is_email("my#mail@gmail.com"))
110+
111+
def test_dollar_is_valid_char_in_local_part(self):
112+
self.assertTrue(is_email("my$mail@gmail.com"))
113+
114+
def test_and_is_valid_char_in_local_part(self):
115+
self.assertTrue(is_email("my&mail@gmail.com"))
116+
117+
def test_asterisk_is_valid_char_in_local_part(self):
118+
self.assertTrue(is_email("my*mail@gmail.com"))
119+
120+
def test_apostrophe_is_valid_char_in_local_part(self):
121+
self.assertTrue(is_email("my'mail@gmail.com"))
122+
123+
def test_acute_accent_is_valid_char_in_local_part(self):
124+
self.assertTrue(is_email("my`mail@gmail.com"))
125+
126+
def test_percentage_is_valid_char_in_local_part(self):
127+
self.assertTrue(is_email("my%mail@gmail.com"))
128+
129+
def test_exclamation_mark_is_valid_char_in_local_part(self):
130+
self.assertTrue(is_email("my!mail@gmail.com"))
131+
132+
def test_caret_is_valid_char_in_local_part(self):
133+
self.assertTrue(is_email("my^mail@gmail.com"))
134+
135+
def test_pipe_is_valid_char_in_local_part(self):
136+
self.assertTrue(is_email("my|mail@gmail.com"))
137+
138+
def test_tilde_is_valid_char_in_local_part(self):
139+
self.assertTrue(is_email("my~mail@gmail.com"))
140+
141+
def test_curly_braces_are_valid_char_in_local_part(self):
142+
self.assertTrue(is_email("my{mail@gmail.com"))
143+
self.assertTrue(is_email("my}mail@gmail.com"))
144+
self.assertTrue(is_email("{mymail}@gmail.com"))
145+
146+
def test_local_part_cannot_start_with_period(self):
147+
self.assertFalse(is_email('.myemail@gmail.com'))
148+
149+
def test_local_part_cannot_end_with_period(self):
150+
self.assertFalse(is_email('myemail.@gmail.com'))
151+
152+
def test_local_part_cannot_have_multiple_consecutive_periods(self):
153+
self.assertFalse(is_email('my..email@gmail.com'))
154+
self.assertFalse(is_email('my.email...nope@gmail.com'))
155+
156+
def test_empty_spaces_are_allowed_only_if_escaped(self):
157+
self.assertFalse(is_email('my mail@gmail.com'))
158+
self.assertTrue(is_email('my\\ mail@gmail.com'))
159+
self.assertTrue(is_email('"my mail"@gmail.com'))
160+
161+
def test_local_part_can_be_quoted(self):
162+
self.assertTrue(is_email('"foo"@example.com'))
163+
164+
def test_with_quoted_string_multiple_at_are_accepted(self):
165+
self.assertTrue(is_email('"Abc@def"@example.com'))
166+
167+
def test_with_escape_multiple_at_are_accepted(self):
168+
self.assertTrue(is_email('Abc\\@def@example.com'))
169+
170+
def test_local_part_can_have_self_escape(self):
171+
self.assertTrue(is_email('Joe.\\\\Blow@example.com'))

0 commit comments

Comments
 (0)