Python regular expression

Shorthand character classes:

  • \d - Any numeric digit from 0 to 9.
  • \D - Any character that is not a numeric digit from 0 to 9.
  • \w - Any letter, numeric digit, or the underscore character. (Think of this as matching “word” characters.)
  • \W - Any character that is not a letter, numeric digit, or the underscore character.
  • \s - Any space, tab, or newline character. (Think of this as matching “space” characters.)
  • \S - Any character that is not a space, tab, or newline.



.       - Any Character Except New Line

\d      - Digit (0-9)

\D      - Not a Digit (0-9)

\w      - Word Character (a-z, A-Z, 0-9, _)

\W      - Not a Word Character

\s      - Whitespace (space, tab, newline)

\S      - Not Whitespace (space, tab, newline)



\b      - Word Boundary

\B      - Not a Word Boundary

^       - Beginning of a String

$       - End of a String



[]      - Matches Characters in brackets

[^ ]    - Matches Characters NOT in brackets

|       - Either Or

( )     - Group



Quantifiers:

*       - 0 or More

+       - 1 or More

?       - 0 or One

{3}     - Exact Number

{3,4}   - Range of Numbers (Minimum, Maximum)





#### Sample Regexes ####



[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

[0-5] is the same as: (0|1|2|3|4|5)

Regexp example:


import re



text_to_search = 'abc defg hij abcd efg hijk'



pattern = re.compile(r'abc')

matches = pattern.finditer(text_to_search)

# print('Phone number found: ' + test.group())



for match in matches:

    print(match)

    

# <re.Match object; span=(0, 3), match='abc'>

# <re.Match object; span=(13, 16), match='abc'>





urls = '''
https://www.google.com
http://youtube.com
https://www.nasa.gov
'''

pattern_url = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')  # optional 's' and 'www.'

subbed_urls = pattern_url.sub(r'\2\3', urls)  # replaces each match with groups 2 and 3

print(subbed_urls)

# google.com
# youtube.com
# nasa.gov

matches = pattern_url.finditer(urls)

for match in matches:
    print(match.group(2))

# google
# youtube
# nasa

matches = pattern_url.findall(urls)

for match in matches:
    print(match)

# ('www.', 'google', '.com')
# ('', 'youtube', '.com')
# ('www.', 'nasa', '.gov')

>>> import re

>>> phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

>>> test = phoneNumRegex.search('Phone number is 333-555-7777.')

>>> print('Phone number found: ' + test.group())

Phone number found: 333-555-7777


>>> import re

>>> phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')

>>> mo = phoneNumRegex.search('My number is 415-555-4242.')

>>> mo.group(1)

'415'

>>> mo.group(2)

'555-4242'

>>> mo.group(0)

'415-555-4242'

>>> mo.group()

'415-555-4242'


>>> import re

>>> mo.groups()

('415', '555-4242')

>>> areaCode, mainNumber = mo.groups()

>>> print(areaCode)

415

>>> print(mainNumber)

555-4242


>>> import re

>>> phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')

>>> mo = phoneNumRegex.search('My phone number is (415) 555-4242.')

>>> mo.group(1)

'(415)'

>>> mo.group(2)

'555-4242'


>>> import re

>>> batRegex = re.compile(r'Bat(man|mobile|copter|bat)')

>>> mo = batRegex.search('Batmobile lost a wheel')

>>> mo.group()

'Batmobile'

>>> mo.group(1)

'mobile'


>>> batRegex = re.compile(r'Bat(wo){0,1}man')

>>> mo1 = batRegex.search('The Adventures of Batman')

>>> mo1.group()

'Batman'

>>> mo2 = batRegex.search('The Adventures of Batwoman')

>>> mo2.group()

'Batwoman'


>>> batRegex = re.compile(r'Bat(wo){0,}man')

>>> mo1 = batRegex.search('The Adventures of Batman')

>>> mo1.group()

'Batman'

>>> mo2 = batRegex.search('The Adventures of Batwoman')

>>> mo2.group()

'Batwoman'

>>> mo3 = batRegex.search('The Adventures of Batwowowowoman')

>>> mo3.group()

'Batwowowowoman'


>>> haRegex = re.compile(r'(Yo){3}')

>>> re1 = haRegex.search('YoYoYo')

>>> re1.group()

'YoYoYo'

>>> re2 = haRegex.search('Yo')

>>> re2 == None

True



>>> haRegex = re.compile(r'(Yo){2,4}')  # Same as: ((Yo)(Yo))|((Yo)(Yo)(Yo))|((Yo)(Yo)(Yo)(Yo))

>>> re1 = haRegex.search('YoYoYo')

>>> re1.group()

'YoYoYo'


>>> phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups

>>> phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']


>>> phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups

>>> phoneNumRegex.findall('Cell: 444-555-9999 Work: 333-555-0000')

[('444', '555', '9999'), ('333', '555', '0000')]


>>> fruRegex = re.compile(r'\d+\s\w+')

>>> fruRegex.findall('5 bananas, 7 apples, 9 oranges')

['5 bananas', '7 apples', '9 oranges']

Custom character classes


>>> vowelRegex = re.compile(r'[aeiouAEIOU]')

>>> vowelRegex.findall('ABC DE. abc de.')

['A', 'E', 'a', 'e']

String begins with custom characters


>>> beginsWithHello = re.compile(r'^Hello')

>>> beginsWithHello.search('Hello world!')

<re.Match object; span=(0, 5), match='Hello'>

>>> beginsWithHello.search('He said hello.') == None

True

String ends with custom characters


>>> endsWithNumber = re.compile(r'\d$')

>>> endsWithNumber.search('Your number is 42')

<re.Match object; span=(16, 17), match='2'>

>>> endsWithNumber.search('Your number is forty two.') == None

True

String begins and ends with custom characters


>>> wholeStringIsNum = re.compile(r'^\d+$')

>>> wholeStringIsNum.search('1234567890')

<re.Match object; span=(0, 10), match='1234567890'>

>>> wholeStringIsNum.search('12345xyz67890') == None

True

>>> wholeStringIsNum.search('12 34567890') == None

True

"Any character" wildcard


>>> atRegex = re.compile(r'.at')

>>> atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

"Matching Everything" wildcard


>>> nameRegex = re.compile(r'First Name: (.{0,}) Last Name: (.{0,})')

>>> mo = nameRegex.search('First Name: Jack Last Name: Fox')

>>> mo.group(1)

'Jack'

>>> mo.group(2)

'Fox'

Greedy and nongreedy matching


>>> nongreedyRegex = re.compile(r'<.*?>')

>>> mo = nongreedyRegex.search('<To serve man> for dinner.>')

>>> mo.group()

'<To serve man>'

>>> greedyRegex = re.compile(r'<.*>')

>>> mo = greedyRegex.search('<To serve man> for dinner.>')

>>> mo.group()

'<To serve man> for dinner.>'

Matching Newlines with the Dot Character


>>> noNewlineRegex = re.compile('.*')

>>> noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

>>> newlineRegex = re.compile('.*', re.DOTALL)

>>> newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

Strings Substitution with Regular Expression


>>> namesRegex = re.compile(r'Agent \w+')

>>> namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

Strings Substitution with Regular Expression - Extended


>>> agentNamesRegex = re.compile(r'Agent (\w)\w*')

>>> agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

Leave a Comment