python regexp - ghdrako/doc_snipets GitHub Wiki

import re

# Quick tour of the core `re` functions: every entry below is truthy.
checks = [
    not re.match("a", "cat"),  # 'cat' doesn't start with 'a'
    re.search("a", "cat"),  # 'cat' has an 'a' in it
    not re.search("c", "dog"),  # 'dog' doesn't have a 'c' in it
    3 == len(re.split("[ab]", "carbs")),  # split on a or b to ['c','r','s']
    "R-D-" == re.sub("[0-9]", "-", "R2D2"),  # replace digits with dashes
]
# print is a function in Python 3, so the call needs parentheses.
print(all(checks))  # prints True


import re
# Pre-compile a character class that matches a single comma or underscore.
regex = re.compile(r"[,_]")
# Split on every occurrence of either delimiter.
# NOTE(review): messy_data is not defined in this snippet — presumably a str
# defined elsewhere; confirm before running.
separated_words2 = regex.split(messy_data)

OOP style regex

import re
# Compile the pattern once, then call methods on the resulting Pattern object.
regex = re.compile("do")
regex.pattern
# output: 'do'
regex.search("do homework")
# output: <re.Match object; span=(0, 2), match='do'>
regex.findall("don't do that")
# output: ['do', 'do']

Function style

import re
# Module-level functions take the pattern as their first argument.
re.search("pattern", "the string to be searched")
# output: None (the text "pattern" does not occur in the searched string)
re.findall("pattern", "the string to be searched")
# output: []

Behind the scenes, when we call re.search , Python creates the Pattern object for us and calls the search method on the pattern.

In OOP style, when you use the compile function to create a Pattern object, you keep a reference to the compiled pattern yourself, so it’s more efficient to use the pattern multiple times because there is no need to compile the pattern a second time.

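By contrast, the functional approach has to obtain the pattern on every call. The re module keeps an internal cache of recently compiled patterns, so the pattern is usually not recompiled from scratch, but each call still pays for the cache lookup, so it doesn’t get the full efficiency benefit of reusing your own compiled Pattern object.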

Creating the pattern with a raw string

# Without a raw string, every regex-level backslash must be escaped again for
# the Python string literal: the four backslashes below produce the regex
# \\task, which matches one literal backslash followed by "task".
task_pattern = re.compile("\\\\task")   # \\task  - both backslashes must be escaped
texts = ["\task", "\\task", "\\\task", "\\\\task"]
for text in texts:
    # match() anchors at the start of the string; !r shows the escaped form.
    print(f"Match {text!r}: {task_pattern.match(text)}")
# output the following lines:
# Match '\task': None
# Match '\\task': <re.Match object; span=(0, 5), match='\\task'>
# Match '\\\task': None
# Match '\\\\task': None
# A raw string passes backslashes to the regex engine unchanged, so only the
# regex-level escape is needed: r"\\task" is the regex \\task, which matches
# one literal backslash followed by "task".
task_pattern_r = re.compile(r"\\task")    # with a raw string, no extra Python-level escaping is needed
texts = ["\task", "\\task", "\\\task", "\\\\task"]
for text in texts:
    # match() anchors at the start of the string; !r shows the escaped form.
    print(f"Match {text!r}: {task_pattern_r.match(text)}")
# output the following lines:
# Match '\task': None
# Match '\\task': <re.Match object; span=(0, 5), match='\\task'>
# Match '\\\task': None
# Match '\\\\task': None

Character classes

  • \d any decimal digit
  • \D any character that is not a decimal digit
  • \s any whitespace, including space, \t, \n, \r, \f, \v
  • \S any character that isn't a whitespace
  • \w any word character, meaning alphanumeric characters plus the underscore
  • \W any character that is not a word character
  • . any character except a newline
  • [] a set of defined characters

[] to define a character set:

  • individual characters [abcxyz]
  • range of characters. [a-z] [A-Z]
  • combine different ranges of characters: [a-dw-z] matches a-d and w-z

LOGICAL OPERATORS

  • a|b a or b
  • (abc) abc as a group
  • [^a] any character other than a
# (\w\d)+ matches one or more consecutive "word character + digit" pairs;
# search() returns the first such run found anywhere in the string.
match = re.search(r"(\w\d)+", "xyza2b1c3dd")
print(match)
# output: <re.Match object; span=(3, 9), match='a2b1c3'>

# group() with no argument returns the whole matched text.
print("matched:", match.group())
# output: matched: a2b1c3
# span() returns the (start, end) indexes of the match.
print("span:", match.span())
# output: span: (3, 9)
print(f"start: {match.start()} & end: {match.end()}")
# output: start: 3 & end: 9
# re.match returns None when the pattern does not match at the start of the
# string, so test the result for truthiness before using it.
match = re.match("pattern", "string to match")
if match:
    print("do something with the matched")
else:
    print("found no matches")

WORKING WITH MULTIPLE GROUPS

# Two capture groups: the word before ", " and the word after it.
match = re.match(r"(\w+), (\w+)", "Homework, urgent; today")
print(match)
# output: <re.Match object; span=(0, 16), match='Homework, urgent'>
match.groups()  # every captured group, as a tuple
# output: ('Homework', 'urgent')
match.group(0)  # group 0 is the entire match
# output: 'Homework, urgent'
match.group(1)  # first parenthesized group
# output: 'Homework'
match.group(2)  # second parenthesized group
# output: 'urgent'

# span(n) gives the (start, end) indexes of group n; 0 is the entire match.
match.span(0)
# output: (0, 16)
match.span(1)
# output: (0, 8)
match.span(2)
# output: (10, 16)
Method Code example Match/return value
search: Returns a Match if a match is found anywhere in the string. re.search(r"\d+", "ab12xy") '12'
_ re.search(r"\d+", "abxy") None
match: Returns a Match only if a match is found at the string’s beginning. re.match(r"\d+", "ab12xy") None
_ re.match(r"\d+", "12abxy") '12'
findall: Returns a list of strings that match the pattern. When the pattern has multiple groups, the item is a tuple. re.findall(r"h[ie]\w", "hi hey hello") ['hey', 'hel']
_ re.findall(r"(h|H)(i|e)","Hey hello") [('H', 'e'), ('h', 'e')]
finditer: Returns an iterator that yields the Match objects. re.finditer(r"(h|H)(i|e)","hi Hey hello") An iterator
split: Splits the string by the pattern. re.split(r"\d+",'a1b2c3d4e') ['a', 'b', 'c', 'd', 'e']
sub: Creates a string by replacing the matched with the replacement. re.sub(r"\D", "-",'123,456_789') '123-456-789'

Extracting data from individual groups

# Pull the three captured fields out of every line that matches the pattern:
# three digits, ", ", one word, "; ", then the rest of the line.
# NOTE(review): text_data is not defined in this snippet — presumably a
# newline-separated str of task records defined elsewhere; confirm.
regex = re.compile(r"(\d{3}), (\w+); (.+)")
tasks = []
for line in text_data.split("\n"):
    match = regex.match(line)
    if match:  # lines that don't match the pattern are skipped
        task = (match.group(1), match.group(2), match.group(3))  # Creates a tuple from multiple groups
        tasks.append(task)
print(tasks)
# output the following line
# [('101', 'Homework', 'Complete physics and math'),
#  ('102', 'Laundry', 'Wash all the clothes today'),
#  ('103', 'Museum', 'All about Egypt')]

Using named groups for text processing

To name a group, you use the syntax (?P<group_name>pattern), in which you name the pattern group as group_name. The name should be a valid Python identifier because you must be able to retrieve it by calling the name.

# Named groups: (?P<name>...) lets each captured field be read by name.
# The pattern must stay on one logical line; a raw string literal cannot
# contain a bare line break.
regex = re.compile(
    r"(?P<task_id>\d{3}), (?P<task_title>\w+); (?P<task_desc>.+)"
)
# Build one (id, title, description) tuple per matching line, reading each
# captured field by its group name instead of by position.
# NOTE(review): text_data is not defined in this snippet — presumably a
# newline-separated str of task records defined elsewhere; confirm.
tasks = []
for line in text_data.split("\n"):
    match = regex.match(line)
    if match:  # lines that don't match the pattern are skipped
        task = (
            match.group('task_id'),
            match.group('task_title'),
            match.group('task_desc'),
        )
        tasks.append(task)
⚠️ **GitHub.com Fallback** ⚠️