Data profile queries - IntegratedBreedingPlatform/Documentation GitHub Wiki

Following are some queries that will help in getting a picture of data profiles in BMS crop databases.

Trials and Nurseries Metadata

Query takes a while (minutes) to run on big crop databases but produces a useful result containing metadata (plot count, trial/nursery name/ folder & program it belongs to etc.

select count(ndep.nd_experiment_id) as experiment_count, ndep.project_id as datasetId, p.name as datasetName, folderInfo.folder, dataSetInfo.mainStudyId, dataSetInfo.mainStudyName, wbproj.project_name as workbenchProgramName
	from nd_experiment_project  ndep
    left outer join project p on p.project_id = ndep.project_id    

    
    left outer join (
		select p1.project_id as datasetId, p1.name as dataSetName, p1.description, p2.project_id as mainStudyId, p2.name as mainStudyName 
        
			from project_relationship pr
	
			left outer join project p1 on p1.project_id = pr.subject_project_id
			left outer join project p2 on p2.project_id = pr.object_project_id
    
			where pr.type_id = 1150 -- (is a dataset of relationship)
    
    ) 
    as dataSetInfo on dataSetInfo.datasetId = p.project_id
    
	left outer join (
    
		select p1.project_id, p2.name as folder from project_relationship pr
	
			left outer join project p1 on p1.project_id = pr.subject_project_id
			left outer join project p2 on p2.project_id = pr.object_project_id
			where pr.type_id = 1145 -- (the folder relationship)
	) 
	as folderInfo on folderInfo.project_id = dataSetInfo.mainStudyId
    
    left outer join workbench.workbench_project wbproj on wbproj.project_uuid = p.program_uuid
    
    group by ndep.project_id order by experiment_count desc;

Row count of all tables

NOTE: Replace ibdbv2_maize_merged with the crop database you want the row counts for. Example below is for ibdbv2_maize_merged.

SELECT table_name, table_rows
  FROM INFORMATION_SCHEMA.TABLES
     WHERE TABLE_SCHEMA = "ibdbv2_maize_merged";

Size of database

SELECT table_schema AS db, 
   ROUND(SUM(data_length + index_length) / 1024 / 1024, 2) AS size_mb 
   FROM  information_schema.TABLES 
     GROUP BY table_schema;

Stats Queries

These can also be found at

## Please first run mysqlcheck on the entire database. DO NOT SKIP this step. In order to run the mysqlcheck please do the following.
## Open up a command prompt. Navigate to C:\BMS4\infrastructure\mysql\bin
## Run the following command. Note this could take a while on a large database. When asked for a password press enter i.e. no password.
mysqlcheck.exe -u root -p --host localhost --port 43306 --all-databases

## After the above command has finished please open up a Mysql Workbench and run the following query.
## Export results as a CSV and send them back to Leafnode. Please name the file institution_name.csv
## Please CHANGE /tmp/ to a path on your computer where you want to save the script. Note file with be found at that path
SELECT
     table_schema as `Database`,
     table_name AS `Table`,
     round(((data_length + index_length) / 1024 / 1024), 2) `Size in MB`, table_rows
FROM information_schema.TABLES
WHERE table_schema = "workbench" or table_schema LIKE 'ibdbv2%'
INTO OUTFILE '/tmp/INSTITUTION_NAME_database_size_overview.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

## Please CHANGE /tmp/ to a path on your computer and change the INSTITUTION_NAME_CROP_NAME to the name of the instituion and crop you are running the scripts against.
## For example if you were running against CIAT rice database you might change /tmp to C:\\BMS4\\ and INSTITUTION_NAME to CIAT and CROP_NAME to RICE.
## Thus your resulting path prefix will look like C:\\BMS4\\CIAT_RICE
## Note file with be found at that path specifed


## Total number of trials and Nurseries for each program
Select wp.project_name, p.name, cv.name, gpSiteName.value as sitename FROM
	workbench.workbench_project wp
INNER JOIN
    project p ON wp.project_uuid = p.program_uuid
        INNER JOIN
    projectprop ppStudy ON p.project_id = ppStudy.project_id
        AND ppStudy.type_id = 8070
        AND ppStudy.value IN (10000 , 10010)
    INNER JOIN cvterm cv on   ppStudy.value = cv.cvterm_id
    LEFT JOIN
    nd_experiment_project ep ON p.project_id = ep.project_id
        LEFT JOIN
    nd_experiment e ON ep.nd_experiment_id = e.nd_experiment_id
        LEFT JOIN
    nd_geolocationprop gpSiteName ON e.nd_geolocation_id = gpSiteName.nd_geolocation_id
        AND gpSiteName.type_id = 8180

GROUP BY wp.project_name, p.name, cv.name
ORDER BY wp.project_name, p.name, cv.name
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_trials_and_nurseries_for_each_program.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

## Number of environments per nursery or trial
## Note nursery will always show 0 in the number of environments

SELECT
    p.name AS ProjectName,
    cv.name AS StudyType,
    envp.name AS EnvironmentProjectName,
    ep.project_id EnvironementProjectId,
    COUNT(gpSiteId.value) AS NumberOfEnvironements
FROM
    nd_experiment_project ep
        INNER JOIN
    nd_experiment e ON ep.nd_experiment_id = e.nd_experiment_id
        LEFT JOIN
    nd_geolocationprop gpSiteId ON e.nd_geolocation_id = gpSiteId.nd_geolocation_id
        AND gpSiteId.type_id = 8190
        LEFT JOIN
    location l ON l.locid = gpSiteId.value
        LEFT JOIN
    project envp ON envp.project_id = ep.project_id
        LEFT JOIN
    project p ON p.name = SUBSTRING(envp.name,
        1,
        CHAR_LENGTH(envp.name) - 12)
        LEFT JOIN
    projectprop ppStudy ON p.project_id = ppStudy.project_id
        AND ppStudy.type_id = 8070
        AND ppStudy.value IN (10000 , 10010)
        INNER JOIN
    cvterm cv ON ppStudy.value = cv.cvterm_id
WHERE
    envp.name LIKE '%-ENVIRONMENT'
GROUP BY envp.name , cv.name , envp.name , ep.project_id
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_environments_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

## Number of traits per nursery or trial
SELECT
    p.name AS projectName,
    cv.name AS StudyType,
    COUNT(cvt.name) AS numberOfTraits
FROM
    project pdp
        LEFT JOIN
    projectprop pp ON pdp.project_id = pp.project_id
        AND pp.type_id = 1808
        LEFT JOIN
    cvterm cvt ON cvt.name = pp.value
        LEFT JOIN
    project p ON p.name = SUBSTRING(pdp.name,
        1,
        CHAR_LENGTH(pdp.name) - 9)
        LEFT JOIN
    projectprop ppStudy ON p.project_id = ppStudy.project_id
        AND ppStudy.type_id = 8070
        AND ppStudy.value IN (10000 , 10010)
        LEFT JOIN
    cvterm cv ON ppStudy.value = cv.cvterm_id
GROUP BY p.name , cv.name
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_traits_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# Number of plots for each trial/nursery
SELECT
    p.name,
    pdp.name,
    cv.name AS StudyType,
    COUNT(e.nd_experiment_id)
FROM
    nd_experiment_project ep
        LEFT JOIN
    nd_experiment e ON ep.nd_experiment_id = e.nd_experiment_id
        LEFT JOIN
    project pdp ON pdp.project_id = ep.project_id
        LEFT JOIN
    project p ON p.name = SUBSTRING(pdp.name,
        1,
        CHAR_LENGTH(pdp.name) - 9)
        LEFT JOIN
    projectprop ppStudy ON p.project_id = ppStudy.project_id
        AND ppStudy.type_id = 8070
        AND ppStudy.value IN (10000 , 10010)
        LEFT JOIN
    cvterm cv ON ppStudy.value = cv.cvterm_id
WHERE
    pdp.name LIKE '%-PLOTDATA'
GROUP BY p.name , pdp.name , cv.name
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_number_of_plots_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# List size for a trial or nursery include advanced and cross lists
SELECT
    p.name, lms.listtype, COUNT(lp.listdata_project_id)
FROM
    project p
        INNER JOIN
    listnms lms ON lms.projectid = p.project_id
        LEFT JOIN
    listdata_project lp ON lp.list_id = lms.listid
GROUP BY p.name , lms.listtype
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_list_size_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# Folder size
SELECT
    lms.listname, COUNT(lmsc.lhierarchy)
FROM
    listnms lms
        LEFT JOIN
    listnms lmsc ON lms.listid = lmsc.lhierarchy
WHERE
    lms.listtype = 'FOLDER'
GROUP BY lms.listname
UNION SELECT
    'Top Level', COUNT(*)
FROM
    listnms lms
WHERE
    lms.listtype NOT IN ('FOLDER')
        AND lms.projectid IS NULL
        AND lms.lhierarchy IS NULL
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_folder_size_per_study.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# Variable Count
SELECT count(distinct name) As variableCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_variable_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# Property Name Count
SELECT count(distinct property_name) As propertyCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_property_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# Method Name Count
SELECT count(distinct method_name) As methodCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_method_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# Scale Name Count
SELECT count(distinct scale_name) As scaleCount FROM standard_variable_summary
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_scale_count.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# List of all variales and their data type
SELECT svs.name as variableName, svs.property_name, svs.method_name, svs.scale_name, t.name as data_type  FROM   standard_variable_summary svs
 LEFT JOIN  cvterm_relationship r on r.subject_id = scale_id and r.type_id = 1105
        LEFT JOIN
    cvterm t ON r.object_id = t.cvterm_id
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_variable_data_with_datatype.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

# Persentage of categorial variables
SELECT
    (SELECT
            COUNT(t.name)
        FROM
            standard_variable_summary svs
                LEFT JOIN
            cvterm_relationship r ON r.subject_id = scale_id
                AND r.type_id = 1105
                LEFT JOIN
            cvterm t ON r.object_id = t.cvterm_id
        WHERE
            t.name = 'Categorical variable') / (SELECT
            COUNT(*)
        FROM
            standard_variable_summary svs
                LEFT JOIN
            cvterm_relationship r ON r.subject_id = scale_id
                AND r.type_id = 1105
                LEFT JOIN
            cvterm t ON r.object_id = t.cvterm_id) * 100 AS persentageOfCategoricalVariables
INTO OUTFILE '/tmp/INSTITUTION_NAME_CROP_NAME_categorical_variable_size.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';