Data profile queries - IntegratedBreedingPlatform/Documentation GitHub Wiki
Following are some queries that will help in getting a picture of data profiles in BMS crop databases.
Trials and Nurseries Metadata
Query takes a while (minutes) to run on big crop databases but produces a useful result containing metadata (plot count, trial/nursery name/ folder & program it belongs to etc.
select count(ndep.nd_experiment_id) as experiment_count, ndep.project_id as datasetId, p.name as datasetName, folderInfo.folder, dataSetInfo.mainStudyId, dataSetInfo.mainStudyName, wbproj.project_name as workbenchProgramName
from nd_experiment_project ndep
left outer join project p on p.project_id = ndep.project_id
left outer join (
select p1.project_id as datasetId, p1.name as dataSetName, p1.description, p2.project_id as mainStudyId, p2.name as mainStudyName
from project_relationship pr
left outer join project p1 on p1.project_id = pr.subject_project_id
left outer join project p2 on p2.project_id = pr.object_project_id
where pr.type_id = 1150 -- (is a dataset of relationship)
)
as dataSetInfo on dataSetInfo.datasetId = p.project_id
left outer join (
select p1.project_id, p2.name as folder from project_relationship pr
left outer join project p1 on p1.project_id = pr.subject_project_id
left outer join project p2 on p2.project_id = pr.object_project_id
where pr.type_id = 1145 -- (the folder relationship)
)
as folderInfo on folderInfo.project_id = dataSetInfo.mainStudyId
left outer join workbench.workbench_project wbproj on wbproj.project_uuid = p.program_uuid
group by ndep.project_id order by experiment_count desc;
Row count of all tables
NOTE: Replace ibdbv2_maize_merged with the crop database you want the row counts for. Example below is for ibdbv2_maize_merged.
SELECT table_name, table_rows
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA = "ibdbv2_maize_merged";
Size of database
SELECT table_schema AS db,
ROUND(SUM(data_length + index_length) / 1024 / 1024, 2) AS size_mb
FROM information_schema.TABLES
GROUP BY table_schema;
Stats Queries
These can also be found at
- https://github.com/IntegratedBreedingPlatform/Incubator/blob/master/Database%20Profiling%20Queries/crop_agnostic_ibp_database_stats_with_export.sql
- https://github.com/IntegratedBreedingPlatform/Incubator/blob/master/Database%20Profiling%20Queries/crop_based_ibp_database_stats_with_export.sql
## Please first run mysqlcheck on the entire database. DO NOT SKIP this step. In order to run the mysqlcheck please do the following.
## Open up a command prompt. Navigate to C:\BMS4\infrastructure\mysql\bin
## Run the following command. Note this could take a while on a large database. When asked for a password press enter i.e. no password.
mysqlcheck.exe -u root -p --host localhost --port 43306 --all-databases
## After the above command has finished please open up a Mysql Workbench and run the following query.
## Export results as a CSV and send them back to Leafnode. Please name the file institution_name.csv
## Please CHANGE /tmp/ to a path on your computer where you want to save the script. Note file with be found at that path
SELECT
table_schema as `Database`,
table_name AS `Table`,
round(((data_length + index_length) / 1024 / 1024), 2) `Size in MB`, table_rows
FROM information_schema.TABLES
WHERE table_schema = "workbench" or table_schema LIKE 'ibdbv2%'
INTO OUTFILE '/tmp/INSTITUTION_NAME_database_size_overview.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
## Please CHANGE /tmp/ to a path on your computer and change the INSTITUTION_NAME_CROP_NAME to the name of the instituion and crop you are running the scripts against.
## For example if you were running against CIAT rice database you might change /tmp to C:\\BMS4\\ and INSTITUTION_NAME to CIAT and CROP_NAME to RICE.
## Thus your resulting path prefix will look like C:\\BMS4\\CIAT_RICE
## Note file with be found at that path specifed
## Total number of trials and Nurseries for each program
Select wp.project_name, p.name, cv.name, gpSiteName.value as sitename FROM
workbench.workbench_project wp
INNER JOIN
project p ON wp.project_uuid = p.program_uuid
INNER JOIN
projectprop ppStudy ON p.project_id = ppStudy.project_id
AND ppStudy.type_id = 8070
AND ppStudy.value IN (10000 , 10010)
INNER JOIN cvterm cv on ppStudy.value = cv.cvterm_id
LEFT JOIN
nd_experiment_project ep ON p.project_id = ep.project_id
LEFT JOIN
nd_experiment e ON ep.nd_experiment_id = e.nd_experiment_id
LEFT JOIN
nd_geolocationprop gpSiteName ON e.nd_geolocation_id = gpSiteName.nd_geolocation_id
AND gpSiteName.type_id = 8180
GROUP BY wp.project_name, p.name, cv.name
ORDER BY wp.project_name, p.name, cv.name
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_trials_and_nurseries_for_each_program.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
## Number of environments per nursery or trial
## Note nursery will always show 0 in the number of environments
SELECT
p.name AS ProjectName,
cv.name AS StudyType,
envp.name AS EnvironmentProjectName,
ep.project_id EnvironementProjectId,
COUNT(gpSiteId.value) AS NumberOfEnvironements
FROM
nd_experiment_project ep
INNER JOIN
nd_experiment e ON ep.nd_experiment_id = e.nd_experiment_id
LEFT JOIN
nd_geolocationprop gpSiteId ON e.nd_geolocation_id = gpSiteId.nd_geolocation_id
AND gpSiteId.type_id = 8190
LEFT JOIN
location l ON l.locid = gpSiteId.value
LEFT JOIN
project envp ON envp.project_id = ep.project_id
LEFT JOIN
project p ON p.name = SUBSTRING(envp.name,
1,
CHAR_LENGTH(envp.name) - 12)
LEFT JOIN
projectprop ppStudy ON p.project_id = ppStudy.project_id
AND ppStudy.type_id = 8070
AND ppStudy.value IN (10000 , 10010)
INNER JOIN
cvterm cv ON ppStudy.value = cv.cvterm_id
WHERE
envp.name LIKE '%-ENVIRONMENT'
GROUP BY envp.name , cv.name , envp.name , ep.project_id
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_environments_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
## Number of traits per nursery or trial
SELECT
p.name AS projectName,
cv.name AS StudyType,
COUNT(cvt.name) AS numberOfTraits
FROM
project pdp
LEFT JOIN
projectprop pp ON pdp.project_id = pp.project_id
AND pp.type_id = 1808
LEFT JOIN
cvterm cvt ON cvt.name = pp.value
LEFT JOIN
project p ON p.name = SUBSTRING(pdp.name,
1,
CHAR_LENGTH(pdp.name) - 9)
LEFT JOIN
projectprop ppStudy ON p.project_id = ppStudy.project_id
AND ppStudy.type_id = 8070
AND ppStudy.value IN (10000 , 10010)
LEFT JOIN
cvterm cv ON ppStudy.value = cv.cvterm_id
GROUP BY p.name , cv.name
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_traits_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# Number of plots for each trial/nursery
SELECT
p.name,
pdp.name,
cv.name AS StudyType,
COUNT(e.nd_experiment_id)
FROM
nd_experiment_project ep
LEFT JOIN
nd_experiment e ON ep.nd_experiment_id = e.nd_experiment_id
LEFT JOIN
project pdp ON pdp.project_id = ep.project_id
LEFT JOIN
project p ON p.name = SUBSTRING(pdp.name,
1,
CHAR_LENGTH(pdp.name) - 9)
LEFT JOIN
projectprop ppStudy ON p.project_id = ppStudy.project_id
AND ppStudy.type_id = 8070
AND ppStudy.value IN (10000 , 10010)
LEFT JOIN
cvterm cv ON ppStudy.value = cv.cvterm_id
WHERE
pdp.name LIKE '%-PLOTDATA'
GROUP BY p.name , pdp.name , cv.name
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_plots_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# List size for a trial or nursery include advanced and cross lists
SELECT
p.name, lms.listtype, COUNT(lp.listdata_project_id)
FROM
project p
INNER JOIN
listnms lms ON lms.projectid = p.project_id
LEFT JOIN
listdata_project lp ON lp.list_id = lms.listid
GROUP BY p.name , lms.listtype
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_list_size_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# Folder size
SELECT
lms.listname, COUNT(lmsc.lhierarchy)
FROM
listnms lms
LEFT JOIN
listnms lmsc ON lms.listid = lmsc.lhierarchy
WHERE
lms.listtype = 'FOLDER'
GROUP BY lms.listname
UNION SELECT
'Top Level', COUNT(*)
FROM
listnms lms
WHERE
lms.listtype NOT IN ('FOLDER')
AND lms.projectid IS NULL
AND lms.lhierarchy IS NULL
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_folder_size_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# Variable Count
SELECT count(distinct name) As variableCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_variable_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# Property Name Count
SELECT count(distinct property_name) As propertyCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_property_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# Method Name Count
SELECT count(distinct method_name) As methodCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_method_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# Scale Name Count
SELECT count(distinct scale_name) As scaleCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_scale_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# List of all variales and their data type
SELECT svs.name as variableName, svs.property_name, svs.method_name, svs.scale_name, t.name as data_type FROM standard_variable_summary svs
LEFT JOIN cvterm_relationship r on r.subject_id = scale_id and r.type_id = 1105
LEFT JOIN
cvterm t ON r.object_id = t.cvterm_id
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_variable_data_with_datatype.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
# Persentage of categorial variables
SELECT
(SELECT
COUNT(t.name)
FROM
standard_variable_summary svs
LEFT JOIN
cvterm_relationship r ON r.subject_id = scale_id
AND r.type_id = 1105
LEFT JOIN
cvterm t ON r.object_id = t.cvterm_id
WHERE
t.name = 'Categorical variable') / (SELECT
COUNT(*)
FROM
standard_variable_summary svs
LEFT JOIN
cvterm_relationship r ON r.subject_id = scale_id
AND r.type_id = 1105
LEFT JOIN
cvterm t ON r.object_id = t.cvterm_id) * 100 AS persentageOfCategoricalVariables
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_categorical_variable_size.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';