Python regular expression

Shorthand character class:

\d - Any numeric digit from 0 to 9.
\D - Any character that is not a numeric digit from 0 to 9.
\w - Any letter, numeric digit, or the underscore character. (Think of this as matching “word” characters.)
\W - Any character that is not a letter, numeric digit, or the underscore character.
\s - Any space, tab, or newline character. (Think of this as matching “space” characters.)
\S - Any character that is not a space, tab, or newline.




.       - Any Character Except New Line

\d      - Digit (0-9)

\D      - Not a Digit (0-9)

\w      - Word Character (a-z, A-Z, 0-9, _)

\W      - Not a Word Character

\s      - Whitespace (space, tab, newline)

\S      - Not Whitespace (space, tab, newline)



\b      - Word Boundary

\B      - Not a Word Boundary

^       - Beginning of a String

$       - End of a String



[]      - Matches Characters in brackets

[^ ]    - Matches Characters NOT in brackets

|       - Either Or

( )     - Group



Quantifiers:

*       - 0 or More

+       - 1 or More

?       - 0 or One

{3}     - Exact Number

{3,4}   - Range of Numbers (Minimum, Maximum)





#### Sample Regexes ####



[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

[0-5] is the same as: (0|1|2|3|4|5)

Regexp example:


import re



text_to_search = 'abc defg hij abcd efg hijk'



pattern = re.compile(r'abc')

matches = pattern.finditer(text_to_search)

# print('Phone number found: ' + test.group())



for match in matches:

    print(match)

    

# <re.Match object; span=(0, 3), match='abc'>

# <re.Match object; span=(13, 16), match='abc'>





urls = '''

https://www.google.com

http://youtube.com

https://www.nasa.gov


'''



pattern_url = re.compile(r'https?://(www\.)?(\w+)(\.\w+)') # optional 's' and 'www'

subbed_urls = pattern_url.sub(r'\2\3', urls) # replaces matches with group 2 and 3

print(subbed_urls) # google.com youtube.com nasa.gov



matches = pattern_url.finditer(urls)



for match in matches:

    print(match.group(2))

# google

# youtube

# nasa





matches = pattern_url.findall(urls)



for match in matches:

    print(match)



# ('www.', 'google', '.com')

# ('', 'youtube', '.com')

# ('www.', 'nasa', '.gov')


>>> import re

>>> phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

>>> test = phoneNumRegex.search('Phone number is 333-555-7777.')

>>> print('Phone number found: ' + test.group())

Phone number found: 333-555-7777


>>> import re

>>> phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')

>>> mo = phoneNumRegex.search('My number is 415-555-4242.')

>>> mo.group(1)

'415'

>>> mo.group(2)

'555-4242'

>>> mo.group(0)

'415-555-4242'

>>> mo.group()

'415-555-4242'


>>> import re

>>> mo.groups()

('415', '555-4242')

>>> areaCode, mainNumber = mo.groups()

>>> print(areaCode)

415

>>> print(mainNumber)

555-4242


>>> import re

>>> phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')

>>> mo = phoneNumRegex.search('My phone number is (415) 555-4242.')

>>> mo.group(1)

'(415)'

>>> mo.group(2)

'555-4242'


>>> import re

>>> batRegex = re.compile(r'Bat(man|mobile|copter|bat)')

>>> mo = batRegex.search('Batmobile lost a wheel')

>>> mo.group()

'Batmobile'

>>> mo.group(1)

'mobile'


>>> batRegex = re.compile(r'Bat(wo){0,1}man')

>>> mo1 = batRegex.search('The Adventures of Batman')

>>> mo1.group()

'Batman'

>>> mo2 = batRegex.search('The Adventures of Batwoman')

>>> mo2.group()

'Batwoman'


>>> batRegex = re.compile(r'Bat(wo){0,}man')

>>> mo1 = batRegex.search('The Adventures of Batman')

>>> mo1.group()

'Batman'

>>> mo2 = batRegex.search('The Adventures of Batwoman')

>>> mo2.group()

'Batwoman'

>>> mo3 = batRegex.search('The Adventures of Batwowowowoman')

>>> mo3.group()

'Batwowowowoman'


>>> haRegex = re.compile(r'(Yo){3}')

>>> re1 = haRegex.search('YoYoYo')

>>> re1.group()

'YoYoYo'

>>> re2 = haRegex.search('Yo')

>>> re2 == None

True



>>> haRegex = re.compile(r'(Yo){2,4}')  # Same as: ((Yo)(Yo))|((Yo)(Yo)(Yo))|((Yo)(Yo)(Yo)(Yo))

>>> re1 = haRegex.search('YoYoYo')

>>> re1.group()

'YoYoYo'


>>> phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups

>>> phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']


>>> phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups

>>> phoneNumRegex.findall('Cell: 444-555-9999 Work: 333-555-0000')

[('444', '555', '9999'), ('333', '555', '0000')]


>>> fruRegex = re.compile(r'\d+\s\w+')

>>> fruRegex.findall('5 bananas, 7 apples, 9 oranges')

['5 bananas', '7 apples', '9 oranges']

Custom character classes


>>> vowelRegex = re.compile(r'[aeiouAEIOU]')

>>> vowelRegex.findall('ABC DE. abc de.')

['A', 'E', 'a', 'e']

String begins with custom characters


>>> beginsWithHello = re.compile(r'^Hello')

>>> beginsWithHello.search('Hello world!')

<_sre.SRE_Match object; span=(0, 5), match='Hello'>

>>> beginsWithHello.search('He said hello.') == None

True

String ends with custom characters


>>> endsWithNumber = re.compile(r'\d$')

>>> endsWithNumber.search('Your number is 42')

<_sre.SRE_Match object; span=(16, 17), match='2'>

>>> endsWithNumber.search('Your number is forty two.') == None

True

String begins and ends with custom characters


>>> wholeStringIsNum = re.compile(r'^\d+$')

>>> wholeStringIsNum.search('1234567890')

<_sre.SRE_Match object; span=(0, 10), match='1234567890'>

>>> wholeStringIsNum.search('12345xyz67890') == None

True

>>> wholeStringIsNum.search('12 34567890') == None

True

"Any character" wildcard


>>> atRegex = re.compile(r'.at')

>>> atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

"Matching Everything" wildcard


>>> nameRegex = re.compile(r'First Name: (.{0,}) Last Name: (.{0,})')

>>> mo = nameRegex.search('First Name: Jack Last Name: Fox')

>>> mo.group(1)

'Jack'

>>> mo.group(2)

'Fox'

Greedy and nongreedy matching


>>> nongreedyRegex = re.compile(r'<.*?>')

>>> mo = nongreedyRegex.search('<To serve man> for dinner.>')

>>> mo.group()

'<To serve man>'

>>> greedyRegex = re.compile(r'<.*>')

>>> mo = greedyRegex.search('<To serve man> for dinner.>')

>>> mo.group()

'<To serve man> for dinner.>'

Matching Newlines with the Dot Character


>>> noNewlineRegex = re.compile('.*')

>>> noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

>>> newlineRegex = re.compile('.*', re.DOTALL)

>>> newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

Strings Substitution with Regular Expression


>>> namesRegex = re.compile(r'Agent \w+')

>>> namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

Strings Substitution with Regular Expression - Extended


>>> agentNamesRegex = re.compile(r'Agent (\w)\w*')

>>> agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

Custom character classes

String begins with custom characters

String ends with custom characters

String begins and ends with custom characters

"Any character" wildcard

"Matching Everything" wildcard

Greedy and nongreedy matching

Matching Newlines with the Dot Character

Strings Substitution with Regular Expression

Strings Substitution with Regular Expression - Extended

Leave a Comment Cancel reply