VALIDATING AN ILLUMINA SAMPLE SHEET
import pandas as pd
# Path to the sample sheet CSV that is loaded with pd.read_csv and validated
csv_in = '2020-05-14_NovaSeq.csv'
CREATE A DataFrame CALLED `df` FROM THE CSV
# Load the sample sheet into a DataFrame named df.
# Every column is read as text because the sheet holds labels, not
# numeric measurements.
raw_sheet = pd.read_csv(csv_in, dtype='str')

# Excel exports tend to carry "NA" filler rows; discard every row that has
# a missing value.
df = raw_sheet.dropna()

# Report the size of the sheet and how many distinct plates it covers
row_total = df.shape[0]
plate_total = df['plate'].nunique()
print(f'Rows: {row_total}')
print(f'Plates: {plate_total}\n')

# Show the first 5 rows as a preview
print(df.head(5))
Rows: 384
Plates: 4
sample-number plate well rawSample-name sample-name
0 1 1 A01 15-4-SW-2-1 15-4-SW-2-1
1 2 1 A02 15-4-SW-3-2 15-4-SW-3-2
2 3 1 A03 17-100-SW-A 17-100-SW-A-1-1-37
3 4 1 A04 17-100-SW-A 17-100-SW-A-1-3-30
4 5 1 A05 17-100-SW-B 17-100-SW-B-1-1-30
CHECKING SAMPLE COUNT PER PLATE
- Each `plate` should have 96 rows
# Every plate on the sheet is expected to hold exactly this many samples.
SAMPLES_PER_PLATE = 96

# Count the rows recorded for each plate, then keep only the plates whose
# count differs from the expected value.
plates = df['plate'].value_counts()
bad_plates = plates[plates != SAMPLES_PER_PLATE]

# A plain conditional is used instead of try/assert: assert statements are
# removed when Python runs with -O, which would make this check always pass.
if bad_plates.empty:
    print(f'[PASS] All {len(plates)} plates have {SAMPLES_PER_PLATE} samples!')
else:
    print(f'[FAIL] Found {len(bad_plates)} of {len(plates)}',
          f'that do not have {SAMPLES_PER_PLATE} samples...')
    # Show the offending plates together with their actual row counts
    print(bad_plates)
[PASS] All 4 plates have 96 samples!
CHECKING FOR DUPLICATE SAMPLE NAMES
`sample-name` should be unique
# Collect the rows whose sample-name has already appeared earlier in the
# sheet. duplicated() leaves the first occurrence unmarked, so only the
# repeats are listed here.
duplicates = df[df['sample-name'].duplicated()]

# A plain conditional is used instead of try/assert: assert statements are
# removed when Python runs with -O, which would make this check always pass.
if duplicates.empty:
    print('[PASS] All sample names are unique!')
else:
    print(f'[FAIL] Found {len(duplicates)} duplicate sample names...')
    print(duplicates)
[PASS] All sample names are unique!
COMPARING THE SAMPLE NAME WITH THE RAW SAMPLE NAME
`sample-name` should start with `rawSample-name`
# Flag every row whose sample-name does not begin with its rawSample-name.
# The two columns are walked in lockstep with zip, which avoids building a
# Series object per row the way iterrows() does.
prefix_mismatch = pd.Series(
    [not name.startswith(raw)
     for name, raw in zip(df['sample-name'], df['rawSample-name'])],
    index=df.index,
    dtype=bool,  # keeps the mask boolean even when the sheet has no rows
)
bad_rows = df[prefix_mismatch]

# A plain conditional is used instead of try/assert: assert statements are
# removed when Python runs with -O, which would make this check always pass.
if bad_rows.empty:
    print('[PASS] Every sample-name starts with its rawSample-name!')
else:
    # The check is a prefix test, so the message says "start with"
    print(f'[FAIL] Found {len(bad_rows)} rows',
          'where sample-name does not start with rawSample-name')
    # List the offending rows, as the other checks do
    print(bad_rows)
[PASS] Every sample-name starts with its rawSample-name!
CHECKING THE LENGTH OF SAMPLE NAMES
`sample-name` should be 100 characters or less
# Collect the rows whose sample-name is longer than 100 characters
bad_rows = df[df['sample-name'].str.len() > 100]

# A plain conditional is used instead of try/assert: assert statements are
# removed when Python runs with -O, which would make this check always pass.
if bad_rows.empty:
    print('[PASS] All sample names are 100 characters or less!')
else:
    print(f'[FAIL] Found {len(bad_rows)} bad rows...\n')
    print(bad_rows)
[PASS] All sample names are 100 characters or less!
CHECKING THE FORMATTING OF SAMPLE NAMES
`sample-name` should only contain: [a-z], [A-Z], [0-9] or '-'
`sample-name` should not start or end with '-'
# A valid sample-name is one alphanumeric character, optionally followed by
# any mix of alphanumerics and '-' that ends on an alphanumeric character.
# The optional group lets one- and two-character names pass; the previous
# pattern needed at least three characters and rejected them.
# \Z anchors at the true end of the string, whereas $ would also accept a
# name that ends with a newline.
pattern = r'^[a-zA-Z0-9](?:[a-zA-Z0-9\-]*[a-zA-Z0-9])?\Z'

# na=False makes a missing name count as "not matching", so the mask stays
# strictly boolean and can be inverted with ~.
well_formed = df['sample-name'].str.match(pattern, na=False)
bad_rows = df[~well_formed]

# A plain conditional is used instead of try/assert: assert statements are
# removed when Python runs with -O, which would make this check always pass.
if bad_rows.empty:
    print('[PASS] All sample names are properly formatted!')
else:
    print(f'[FAIL] Found {len(bad_rows)} bad rows...')
    print(bad_rows)
[PASS] All sample names are properly formatted!
Everything appears to be in order...