Serratus SQL Database Management - ababaian/serratus GitHub Wiki
serratus-aurora
is the main instance. It is an Aurora Serverless instance and can be scaled up to meet ingestion demand during batch uploads. This instance "sleeps" at 0 capacity units while inactive.
Periodically, a publicly accessible, always-on, Aurora Provisioned instance is created from a snapshot of serratus-aurora
. See Creating a Public Instance.
Tables/Views
- Serratus summary info. Created by serratus-summary-uploader.
- Nucleotide
- nsra
- nfamily
- nsequence
- Protein
- psra
- pfamily
- protein
- psequence
- RdRP
- rsra
- rphylum
- rfamily
- rsequence
- Nucleotide
- BioSample/SRA metadata. Created by biosample-sql and sraruninfo-sql.
- biosample: all BioSamples (as of 2021/02/23) with geospatial data extracted if available
- biosample_geocode: internal table for mapping geospatial text values to x/y coordinate values
- biosample_geo_coordinates: all from biosample with x/y coordinates combined with biosample_geocode
- srarun: metadata for all SRA run accessions in the Serratus search space
- srarun_geo_coordinates: all from srarun with x/y coordinates from biosample_geo_coordinates
- rdrp_pos: uploaded from Artem's rdrp_pos.csv
- analysis_index_uploaded_cols: uploaded from Artem's index for
assembly
,micro
Creating Users
serratus
is the main user. Password stored in Secrets Manager. This user is used for table curation and creating the other users below.
-- revoke default access for all users
REVOKE ALL ON SCHEMA public FROM public;
-- read-only group
CREATE ROLE viewer NOSUPERUSER NOINHERIT NOCREATEDB NOCREATEROLE NOREPLICATION;
GRANT CONNECT ON DATABASE summary TO viewer;
GRANT USAGE ON SCHEMA public TO viewer;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO viewer; -- re-run any time a table/view is replaced
-- users
CREATE USER public_reader WITH PASSWORD 'serratus' IN ROLE viewer;
CREATE USER tantalus WITH PASSWORD 'serratus' IN ROLE viewer;
CREATE USER web_api WITH PASSWORD 'serratus' IN ROLE viewer;
Creating Views
Nucleotide
CREATE MATERIALIZED VIEW nfamily_counts AS
SELECT family_name, score, percent_identity, COUNT(*)
FROM nfamily
GROUP BY family_name, score, percent_identity;
CREATE MATERIALIZED VIEW nsequence_counts AS
SELECT sequence_accession, score, percent_identity, COUNT(*)
FROM nsequence
GROUP BY sequence_accession, score, percent_identity;
CREATE MATERIALIZED VIEW nfamily_list AS
SELECT family_name
FROM nfamily
GROUP BY family_name
ORDER BY family_name;
CREATE MATERIALIZED VIEW nsequence_list AS
SELECT sequence_accession, virus_name
FROM nsequence
GROUP BY sequence_accession, virus_name
ORDER BY sequence_accession;
CREATE MATERIALIZED VIEW analysis_index AS
SELECT
run_id,
s.run_id IS NOT NULL AS srarun,
nsra.run_id IS NOT NULL AS nsra,
psra.run_id IS NOT NULL AS psra,
sgc.run_id IS NOT NULL AS geo,
upl.micro IS TRUE AS micro,
upl.assembly IS TRUE AS assembly
FROM (SELECT run AS run_id FROM srarun) AS s
FULL JOIN nsra USING (run_id)
FULL JOIN rsra USING (run_id)
FULL JOIN psra USING (run_id)
FULL JOIN srarun_geo_coordinates sgc USING (run_id)
FULL JOIN analysis_index_uploaded_cols upl USING (run_id);
RdRP
CREATE MATERIALIZED VIEW rphylum_counts AS
SELECT phylum_name, score, percent_identity, COUNT(*)
FROM rphylum
GROUP BY phylum_name, score, percent_identity;
CREATE MATERIALIZED VIEW rfamily_counts AS
SELECT family_name, score, percent_identity, COUNT(*)
FROM rfamily
GROUP BY family_name, score, percent_identity;
CREATE MATERIALIZED VIEW rsequence_counts AS
SELECT sequence_accession, score, percent_identity, COUNT(*)
FROM rsequence
GROUP BY sequence_accession, score, percent_identity;
CREATE MATERIALIZED VIEW rphylum_list AS
SELECT phylum_name
FROM rphylum
GROUP BY phylum_name
ORDER BY phylum_name;
CREATE MATERIALIZED VIEW rfamily_list AS
SELECT family_name
FROM rfamily
GROUP BY family_name
ORDER BY family_name;
CREATE MATERIALIZED VIEW rsequence_list AS
SELECT sequence_accession, virus_name
FROM rsequence
GROUP BY sequence_accession, virus_name
ORDER BY sequence_accession;
Metadata
CREATE MATERIALIZED VIEW biosample_geo_coordinates AS
SELECT biosample_id,
coordinate_x,
coordinate_y,
b.geo_text_extracted AS from_text
FROM biosample b
INNER JOIN biosample_geocode bgeo
ON (b.geo_text_extracted = bgeo.geo_text_extracted)
WHERE coordinate_x IS NOT NULL
UNION ALL
SELECT biosample_id,
geo_coordinate_x AS coordinate_x,
geo_coordinate_y AS coordinate_y,
NULL AS FROM_text
FROM biosample
WHERE geo_coordinate_x IS NOT NULL;
CREATE MATERIALIZED VIEW srarun_geo_coordinates AS
SELECT run AS run_id,
bio_sample AS biosample_id,
release_date,
coordinate_x, coordinate_y, from_text
FROM srarun
INNER JOIN biosample_geo_coordinates bgeo
ON (srarun.bio_sample = bgeo.biosample_id);
When referenced tables are replaced, these views should be dropped and recreated. REFRESH MATERIALIZED VIEW
doesn't seem to work in this case.
Sometimes, biosample_geocode
may be uploaded with text data type for numeric columns. If that happens, run this before creating views:
ALTER TABLE biosample_geocode
ALTER COLUMN coordinate_x TYPE DOUBLE PRECISION,
ALTER COLUMN coordinate_y TYPE DOUBLE PRECISION;
Creating Indexes
Indexes are used for optimizing queries.
Nucleotide
CREATE INDEX nfamily_run_id_index ON nfamily (run_id);
CREATE INDEX nfamily_family_name_index ON nfamily (family_name);
CREATE INDEX nfamily_score_index ON nfamily (score);
CREATE INDEX nfamily_percent_identity_index ON nfamily (percent_identity);
CREATE INDEX nsequence_run_id_index ON nsequence (run_id);
CREATE INDEX nsequence_sequence_accession_index ON nsequence (sequence_accession);
CREATE INDEX nsequence_score_index ON nsequence (score);
CREATE INDEX nsequence_percent_identity_index ON nsequence (percent_identity);
CREATE INDEX nsequence_sequence_accession_score_index ON nsequence (sequence_accession, score);
-- views
CREATE INDEX nfamily_counts_family_name_index ON nfamily_counts (family_name);
CREATE INDEX nsequence_counts_sequence_accession_index ON nsequence_counts (sequence_accession);
RdRP
CREATE INDEX rphylum_run_id_index ON rphylum (run_id);
CREATE INDEX rphylum_phylum_name_index ON rphylum (phylum_name);
CREATE INDEX rphylum_score_index ON rphylum (score);
CREATE INDEX rphylum_percent_identity_index ON rphylum (percent_identity);
CREATE INDEX rfamily_run_id_index ON rfamily (run_id);
CREATE INDEX rfamily_family_name_index ON rfamily (family_name);
CREATE INDEX rfamily_score_index ON rfamily (score);
CREATE INDEX rfamily_percent_identity_index ON rfamily (percent_identity);
CREATE INDEX rsequence_run_id_index ON rsequence (run_id);
CREATE INDEX rsequence_sequence_accession_index ON rsequence (sequence_accession);
CREATE INDEX rsequence_score_index ON rsequence (score);
CREATE INDEX rsequence_percent_identity_index ON rsequence (percent_identity);
-- views
CREATE INDEX rphylum_counts_phylum_name_index ON rphylum_counts (phylum_name);
CREATE INDEX rfamily_counts_family_name_index ON rfamily_counts (family_name);
CREATE INDEX rsequence_counts_sequence_accession_index ON rsequence_counts (sequence_accession);
Metadata
CREATE INDEX srarun_run_index ON srarun (run);
CREATE INDEX srarun_bio_sample_index ON srarun (bio_sample);
CREATE INDEX biosample_geo_coordinates_biosample_id_index ON biosample_geo_coordinates (biosample_id);
CREATE INDEX srarun_geo_coordinates_run_id_index ON srarun_geo_coordinates (run_id);
CREATE INDEX rdrp_pos_run_id_index ON rdrp_pos (run_id);
CREATE INDEX analysis_index_run_id_index ON analysis_index (run_id);
Creating a Public Instance
export ORIGINAL_CLUSTER_ID="serratus-aurora"
export SNAPSHOT_ID="serratus-aurora-20210315"
export RESTORE_CLUSTER_ID="serratus-aurora-20210315"
export RESTORE_INSTANCE_ID="serratus-aurora-20210315-main"
# create snapshot
aws rds create-db-cluster-snapshot \
--db-cluster-snapshot-identifier $SNAPSHOT_ID \
--db-cluster-identifier $ORIGINAL_CLUSTER_ID
# restore from snapshot
aws rds restore-db-cluster-from-snapshot \
--db-cluster-identifier $RESTORE_CLUSTER_ID \
--snapshot-identifier $SNAPSHOT_ID \
--db-subnet-group-name default-vpc-025ef5ccc841b5b86 \
--engine-mode provisioned \
--engine aurora-postgresql \
--vpc-security-group-ids sg-07e2ad8dccb1d4ba6 \
--engine-version 10.14
aws rds create-db-instance \
--db-instance-identifier $RESTORE_INSTANCE_ID \
--db-instance-class db.t3.medium \
--engine aurora-postgresql \
--db-cluster-identifier $RESTORE_CLUSTER_ID \
--publicly-accessible