From fa39507bf3893ad19e81667199f0a2d3a8887a2f Mon Sep 17 00:00:00 2001 From: Celia Michotey <celia.michotey@inra.fr> Date: Thu, 23 Feb 2023 11:13:50 +0100 Subject: [PATCH 1/3] Refactor code, improve log and add test for PG connection. --- etl_gnpis-core/extract-gnpis-core-brapi.sh | 174 +++++++++--------- .../gnpis-pg-to-json/count_extracted_data.sql | 5 +- .../max_id_by_document_type.sql | 47 +++-- 3 files changed, 111 insertions(+), 115 deletions(-) mode change 100755 => 100644 etl_gnpis-core/extract-gnpis-core-brapi.sh diff --git a/etl_gnpis-core/extract-gnpis-core-brapi.sh b/etl_gnpis-core/extract-gnpis-core-brapi.sh old mode 100755 new mode 100644 index 0bc1031..1f16f6c --- a/etl_gnpis-core/extract-gnpis-core-brapi.sh +++ b/etl_gnpis-core/extract-gnpis-core-brapi.sh @@ -1,148 +1,146 @@ #!/usr/bin/env bash + RED='\033[0;31m' GREEN='\033[0;32m' ORANGE='\033[0;33m' BOLD='\033[1m' RED_BOLD="${RED}${BOLD}" NC='\033[0m' # No format +export RED GREEN ORANGE BOLD RED_BOLD NC + +# shellcheck disable=SC2120,SC2162 +colorize() { + # -- #!/bin/bash#colorize by JohnnyB - boeroboy@gmail.com + RED='\033[0;31m' + NC='\033[0m' # No format + + while read line + do + echo -e "${RED}${line}${NC}" + done < "${1:-/dev/stdin}" +} -############### URGI BrAPI Extractor############## +############### URGI BrAPI Extractor ############### # URGI Internal Use # GNU philosophy: do a single step, but do it well. -# This does extraction only, no transformation, -# fully handled by plant-brapi-etl-faidare and its -# avatars. -# ################################################ +# Extraction only, no transformation +# (fully handled by plant-brapi-etl-faidare). 
+#################################################### -###### sources +### Sources #https://stackoverflow.com/questions/71825711/how-to-use-jq-to-format-array-of-objects-to-separated-list-of-key-values #https://stackoverflow.com/questions/54087481/assigning-an-array-parsed-with-jq-to-bash-script-array #https://lzone.de/cheat-sheet/jq -# Check jq installed ? - # load configuration source gnpis_params.cfg -#extract_page_size=10000 extract_page_size=10000 -# get MAX ID for all table +export sqlDir="./gnpis-pg-to-json/" +sqlMaxidFile="$sqlDir/max_id_by_document_type.sql" +sqlCountFile="$sqlDir/count_extracted_data.sql" -sqlMaxidFile="./gnpis-pg-to-json/max_id_by_document_type.sql" -sqlCountFile="./gnpis-pg-to-json/count_extracted_data.sql" -export extractFolderPaged="./data/json-page" -export extractFolder="./data/json" -#transformFolder="./data/json-bulk" -mkdir -p $extractFolder/INRAE-URGI -mkdir -p $extractFolderPaged/INRAE-URGI -rm -f $extractFolder/INRAE-URGI/* -rm -f $extractFolderPaged/INRAE-URGI/* -echo "Extracting data to ${extractFolder}" +export dataSource="INRAE-URGI" +export extractFolder="./data/json/$dataSource" +export extractFolderPaged="./data/json-page/$dataSource" +[ -d $extractFolder ] && rm -f $extractFolder/* || mkdir -p $extractFolder +[ -d $extractFolderPaged ] && rm -f $extractFolderPaged/* || mkdir -p $extractFolderPaged -# ############################ -# EXTRACT INRAE-URGI to INRAE-URGI-E -# ############################ + +############## +# EXTRACTION # +############## do_psql_extract() { declare -i page=$1 local docType=$2 - declare -i docCount=$3 - declare -i e_page_size=$4 - declare -i upper_limit=$((page + e_page_size -1)) + declare -i docMaxId=$3 + declare -i extract_page_size=$4 + declare -i upper_limit=$((page + extract_page_size -1)) source gnpis_params.cfg # this is VOODOO ! why is this needed ? 
#sleep $((RANDOM % 3)) - declare -i page_number=$((page / e_page_size )) - sqlFile="./gnpis-pg-to-json/${docType}.sql" - echo "psql from $page to $upper_limit : $docType $docCount " - psql_cmd="psql -f ${sqlFile} \ - -At --host ${host} -p ${port} \ - -U ${user} -d ${db} \ + declare -i page_number=$((page / extract_page_size )) + + #echo "psql from $page to $upper_limit / $docMaxId " + sqlFile="$sqlDir/${docType}.sql" + psql_cmd="psql -f ${sqlFile} -At \ + -h ${host} -p ${port} -U ${user} -d ${db} \ -v faidareURL=${faidareURL} \ -v gnpisBaseURL=${gnpisBaseURL} \ -v trialId=${trialId} \ -v startPageId=${page} \ -v endPageId=${upper_limit} \ -v FETCH_COUNT=${fetchCount} \ - -o ${extractFolderPaged}/INRAE-URGI/${docType}-${page_number}.json" + -o ${extractFolderPaged}/${docType}-${page_number}.json" #echo $psql_cmd $psql_cmd - } - export -f do_psql_extract -urgi_extract() { - local docType=$1 - local docCount=$2 - - echo " Extracting $docCount $docType " - echo - seq -f'%.0f' 0 $extract_page_size $(($docCount + 1)) |\ - parallel --link --bar do_psql_extract {} $docType $docCount $extract_page_size :::: - - echo "Extraction done for ${docType}" -} -export -f urgi_extract -echo " extracting MAXIDs using ${sqlMaxidFile} " -mockMaxIdCmd="echo 100" +# Test PG connection +eval "psql -h ${host} -p ${port} -d ${db} -U ${user} -c '\conninfo'" 2> >(colorize) +CODE=$? +[ $CODE -gt 0 ] && { echo -e "${RED_BOLD}Error when trying to connect to ${DB_NAME} DB. Check that your passfile is correclty filled. 
Exiting.${NC}" ; exit $CODE ; } +echo -e "${BOLD}Extracting data to ${extractFolder}.${NC}" for documentType in ${documentTypes[@]}; do - - # ### Count maxId ### - maxIdCmd=" psql -f ${sqlMaxidFile} \ - -At --host ${host} -p ${port} \ - -U ${user} -d ${db} \ - -v type=${documentType} " - docDbCount=$($maxIdCmd) + echo -e "\nManage $documentType" + + ### Get max ID ### + echo "* Get max ID" + maxIdCmd=" psql -f ${sqlMaxidFile} -At \ + -h ${host} -p ${port} -U ${user} -d ${db} \ + -v type=${documentType} " + docMaxId=$($maxIdCmd) #echo $maxIdCmd - # ### Extract ### - urgi_extract $documentType $docDbCount + ### Extract data ### + echo "* Extract data" + seq -f'%.0f' 0 $extract_page_size $(($docMaxId + 1)) |\ + parallel --link --bar do_psql_extract {} $documentType $docMaxId $extract_page_size :::: - - # ### Concat paginated output ### - cat ${extractFolderPaged}/INRAE-URGI/${documentType}-*.json > ${extractFolder}/INRAE-URGI/${documentType}.json + ### Concat paginated output ### + echo "* Concat paginated output" + cat ${extractFolderPaged}/${documentType}-*.json > ${extractFolder}/${documentType}.json done -#echo "Extracting URGI data for $docTypesMaxCount" - -#echo $docTypesMaxCount | jq -r 'to_entries[] |.key as $id | .value | to_entries[] | [ .key, .value ] |@sh' - -# ############################ -# VALIDATION -# ############################ +############## +# VALIDATION # +############## -# ############################ -# Count document in DB - -# DB vs E -# ------- +echo -e "\n${BOLD}Count data for validation.${NC}" for documentType in ${documentTypes[@]}; do - countCmd="psql -f ${sqlCountFile} \ - -At --host ${host} -p ${port} \ - -U ${user} -d ${db} \ + + ### Get DB count ### + countCmd="psql -f ${sqlCountFile} -At \ + -h ${host} -p ${port} -U ${user} -d ${db} \ -v type=${documentType} -v trialId=${trialId}" docTypeDbCount=$($countCmd) - docTypeFileCount=$(wc -l ${extractFolder}/INRAE-URGI/${documentType}.json | tr -d "[:alpha:][:blank:][:punct:]") - echo 
${docTypeDbCount} ${docTypeFileCount} - if [ ${docTypeDbCount} -eq ${docTypeFileCount} ]; - then - echo "Extraction validated for ${documentType} with database: ${docTypeDbCount} and file: ${docTypeFileCount} " + #echo "docTypeDbCount = ${docTypeDbCount}" + + ### Get file count ### + docTypeFileCount=$(wc -l ${extractFolder}/${documentType}.json | tr -d "[:alpha:][:blank:][:punct:]") + #echo "docTypeFileCount = ${docTypeFileCount}" + + if [ ${docTypeDbCount} -eq ${docTypeFileCount} ]; then + echo -e "${GREEN}Extraction validated for ${documentType} (${docTypeFileCount} documents)${NC}" else - echo -e "${RED_BOLD} ERROR ${NC} Extraction of ${documentType} with database: ${docTypeDbCount} and file: ${docTypeFileCount} " + echo -e "${RED_BOLD}ERROR: Extraction failed for ${documentType} (database = ${docTypeDbCount} and file = ${docTypeFileCount})${NC}" fi - #KO test - docTypeFileCount=$((docTypeFileCount + 1)) - if [ ${docTypeDbCount} -eq ${docTypeFileCount} ]; - then - echo -e "${RED_BOLD} ERROR ${NC} in the testing code " + + ### KO test ### + docTypeFileCount=$((docTypeFileCount + 1)) + if [ ${docTypeDbCount} -eq ${docTypeFileCount} ]; then + echo -e "${RED_BOLD}ERROR in the testing code${NC}" else - echo "TEST PROCEDURE OK " + echo -e "${GREEN}TEST PROCEDURE OK${NC}" fi -done -# E vs ET +done exit 1 + diff --git a/etl_gnpis-core/gnpis-pg-to-json/count_extracted_data.sql b/etl_gnpis-core/gnpis-pg-to-json/count_extracted_data.sql index a1f21e2..6595049 100644 --- a/etl_gnpis-core/gnpis-pg-to-json/count_extracted_data.sql +++ b/etl_gnpis-core/gnpis-pg-to-json/count_extracted_data.sql @@ -4,7 +4,7 @@ select CASE WHEN :'type' = 'germplasm' THEN (select count(distinct a.accession_id) from accession_t a) - WHEN :'type' = 'germplasmMcpd' THEN + WHEN :'type' = 'germplasmMcpd' THEN (select count(distinct a.accession_id) from accession_t a) WHEN :'type' = 'germplasmAttribute' THEN (select count(distinct a.accession_id) from accession_descriptor_t a) @@ -32,5 +32,4 @@ select 
CASE ) as studies) WHEN :'type' = 'trial' THEN (select count(distinct trial_set_id) from trial_set_t) - END -as count; +END as count; diff --git a/etl_gnpis-core/gnpis-pg-to-json/max_id_by_document_type.sql b/etl_gnpis-core/gnpis-pg-to-json/max_id_by_document_type.sql index 5f0ab81..1173293 100644 --- a/etl_gnpis-core/gnpis-pg-to-json/max_id_by_document_type.sql +++ b/etl_gnpis-core/gnpis-pg-to-json/max_id_by_document_type.sql @@ -3,35 +3,34 @@ select CASE WHEN :'type' = 'germplasm' THEN - (select max( a.accession_id) from accession_t a) - WHEN :'type' = 'germplasmMcpd' THEN - (select max( a.accession_id) from accession_t a) + (select max(a.accession_id) from accession_t a) + WHEN :'type' = 'germplasmMcpd' THEN + (select max(a.accession_id) from accession_t a) WHEN :'type' = 'germplasmAttribute' THEN - (select 10 from ontology_t limit 1) --default min page size, no pagination implemented for that type - --(select max( germplasm.accession_id) - -- FROM accession_t germplasm - -- WHERE exists (select 1 from accession_descriptor_t ad where germplasm.accession_id = ad.accession_id)) + (select 10) --default min page size, no pagination implemented for that type + --(select max(a.accession_id) from accession_descriptor_t a) WHEN :'type' = 'germplasmPedigree' THEN - (select 10 from ontology_t limit 1) --default min page size, no pagination implemented for that type - --(select max(genealogy_id) from genealogy_t ) + (select 10) --default min page size, no pagination implemented for that type + --(select max(genealogy_id) from genealogy_t) WHEN :'type' = 'germplasmProgeny' THEN - (select 10 from ontology_t limit 1) --default min page size, no pagination implemented for that type - --(select max( a.accession_id) from accession_t a - --WHERE exists (select 1 from genealogy_t ge where ge.first_parent_id = a.accession_id or ge.second_parent_id = a.accession_id)) + (select 10) --default min page size, no pagination implemented for that type + --(select max(a.accession_id) from 
accession_t a + --where exists (select 1 from genealogy_t g where g.first_parent_id = a.accession_id or g.second_parent_id = a.accession_id)) WHEN :'type' = 'location' THEN - (select 10 from ontology_t limit 1) --default min page size, no pagination implemented for that type - --(select max( site_id) from site_t) + (select 10) --default min page size, no pagination implemented for that type + --(select max(site_id) from site_t) WHEN :'type' = 'program' THEN - (select 10 from ontology_t limit 1) --default min page size, no pagination implemented for that type - --(select max( project_id) from project_t) + (select 10) --default min page size, no pagination implemented for that type + --(select max(project_id) from project_t) WHEN :'type' = 'observationUnit' THEN - (select max(study_subject_id) from study_subject_t ) + (select max(study_subject_id) from study_subject_t) WHEN :'type' = 'study' THEN - (select 10 from ontology_t limit 1) --default min page size, no pagination implemented for that type - --(select max(id) from (select max(trial_id) as id from trial_t - -- UNION select max(genotyping_experiment_id) as id from genotyping_experiment_t) as MAXIT) + (select 10) --default min page size, no pagination implemented for that type + --(select max(id) from ( + --select max(trial_id) as id from trial_t + --union select max(genotyping_experiment_id) as id from genotyping_experiment_t + --) as MAXID) WHEN :'type' = 'trial' THEN - (select 10 from ontology_t limit 1) --default min page size, no pagination implemented for that type - --(select max( trial_set_id) from trial_set_t) - END -as count; + (select 10) --default min page size, no pagination implemented for that type + --(select max(trial_set_id) from trial_set_t) +END as maxID; -- GitLab From 20c559dace205f011b1babfca665bbf9cfc3c49d Mon Sep 17 00:00:00 2001 From: Celia Michotey <celia.michotey@inra.fr> Date: Thu, 13 Apr 2023 13:49:25 +0200 Subject: [PATCH 2/3] Import GnpIS-core ETL for DataDiscovery, rename 
GnpIS-core ETL for BrAPI. --- .../extract-gnpis-core-brapi.sh | 0 .../gnpis-pg-to-json/count_extracted_data.sql | 0 .../gnpis-pg-to-json/germplasm.sql | 0 .../gnpis-pg-to-json/germplasmAttribute.sql | 0 .../gnpis-pg-to-json/germplasmMcpd.sql | 0 .../gnpis-pg-to-json/germplasmPedigree.sql | 0 .../gnpis-pg-to-json/germplasmProgeny.sql | 0 .../gnpis-pg-to-json/location.sql | 0 .../max_id_by_document_type.sql | 0 .../gnpis-pg-to-json/observationUnit.sql | 0 .../gnpis-pg-to-json/program.sql | 0 .../gnpis-pg-to-json/study.sql | 0 .../gnpis-pg-to-json/trial.sql | 0 .../gnpis_params.cfg | 6 +- etl_gnpis-core_dd/convert_to_json.sh | 75 ++++ etl_gnpis-core_dd/csv2json.jq | 63 +++ etl_gnpis-core_dd/csv_manipulator.py | 48 +++ etl_gnpis-core_dd/extract_gnpis-core.sh | 380 ++++++++++++++++++ .../extract_observation_variables.jq | 29 ++ etl_gnpis-core_dd/map_values_to_json.jq | 28 ++ .../sql/count_extracted_data.sql | 83 ++++ ...ransplant_gnpis_association_extraction.sql | 85 ++++ ...ant_gnpis_genetic_resources_extraction.sql | 146 +++++++ ...transplant_gnpis_genotyping_extraction.sql | 103 +++++ .../transplant_gnpis_mapping_extraction.sql | 277 +++++++++++++ ...ransplant_gnpis_phenotyping_extraction.sql | 89 ++++ .../transplant_gnpis_sequences_extraction.sql | 102 +++++ .../transplant_gnpis_synteny_extraction.sql | 149 +++++++ ...nsplant_gnpis_transcriptome_extraction.sql | 163 ++++++++ etl_gnpis-core_dd/variables_enrichment.sh | 151 +++++++ 30 files changed, 1974 insertions(+), 3 deletions(-) rename {etl_gnpis-core => etl_gnpis-core_brapi}/extract-gnpis-core-brapi.sh (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/count_extracted_data.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/germplasm.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/germplasmAttribute.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/germplasmMcpd.sql (100%) rename {etl_gnpis-core => 
etl_gnpis-core_brapi}/gnpis-pg-to-json/germplasmPedigree.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/germplasmProgeny.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/location.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/max_id_by_document_type.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/observationUnit.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/program.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/study.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis-pg-to-json/trial.sql (100%) rename {etl_gnpis-core => etl_gnpis-core_brapi}/gnpis_params.cfg (56%) create mode 100755 etl_gnpis-core_dd/convert_to_json.sh create mode 100644 etl_gnpis-core_dd/csv2json.jq create mode 100644 etl_gnpis-core_dd/csv_manipulator.py create mode 100755 etl_gnpis-core_dd/extract_gnpis-core.sh create mode 100644 etl_gnpis-core_dd/extract_observation_variables.jq create mode 100644 etl_gnpis-core_dd/map_values_to_json.jq create mode 100644 etl_gnpis-core_dd/sql/count_extracted_data.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_association_extraction.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_genetic_resources_extraction.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_genotyping_extraction.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_mapping_extraction.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_phenotyping_extraction.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_sequences_extraction.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_synteny_extraction.sql create mode 100644 etl_gnpis-core_dd/sql/transplant_gnpis_transcriptome_extraction.sql create mode 100755 etl_gnpis-core_dd/variables_enrichment.sh diff --git a/etl_gnpis-core/extract-gnpis-core-brapi.sh 
b/etl_gnpis-core_brapi/extract-gnpis-core-brapi.sh similarity index 100% rename from etl_gnpis-core/extract-gnpis-core-brapi.sh rename to etl_gnpis-core_brapi/extract-gnpis-core-brapi.sh diff --git a/etl_gnpis-core/gnpis-pg-to-json/count_extracted_data.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/count_extracted_data.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/count_extracted_data.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/count_extracted_data.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/germplasm.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/germplasm.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/germplasm.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/germplasm.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/germplasmAttribute.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmAttribute.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/germplasmAttribute.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmAttribute.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/germplasmMcpd.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmMcpd.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/germplasmMcpd.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmMcpd.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/germplasmPedigree.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmPedigree.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/germplasmPedigree.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmPedigree.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/germplasmProgeny.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmProgeny.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/germplasmProgeny.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/germplasmProgeny.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/location.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/location.sql similarity 
index 100% rename from etl_gnpis-core/gnpis-pg-to-json/location.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/location.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/max_id_by_document_type.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/max_id_by_document_type.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/max_id_by_document_type.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/max_id_by_document_type.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/observationUnit.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/observationUnit.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/observationUnit.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/observationUnit.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/program.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/program.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/program.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/program.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/study.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/study.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/study.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/study.sql diff --git a/etl_gnpis-core/gnpis-pg-to-json/trial.sql b/etl_gnpis-core_brapi/gnpis-pg-to-json/trial.sql similarity index 100% rename from etl_gnpis-core/gnpis-pg-to-json/trial.sql rename to etl_gnpis-core_brapi/gnpis-pg-to-json/trial.sql diff --git a/etl_gnpis-core/gnpis_params.cfg b/etl_gnpis-core_brapi/gnpis_params.cfg similarity index 56% rename from etl_gnpis-core/gnpis_params.cfg rename to etl_gnpis-core_brapi/gnpis_params.cfg index 79a90bf..70670f6 100644 --- a/etl_gnpis-core/gnpis_params.cfg +++ b/etl_gnpis-core_brapi/gnpis_params.cfg @@ -1,14 +1,14 @@ host='localhost' db='aster' -port=54322 +port=5432 user='aster' fetchCount=1000 # production -documentTypes=("germplasm" "observationUnit" "location" "germplasmAttribute" "germplasmPedigree" "germplasmProgeny" "program" "study" 
"trial") +documentTypes=("germplasm" "germplasmAttribute" "germplasmPedigree" "germplasmProgeny" "location" "program" "study" "trial" "observationUnit") # tests -#documentTypes=("germplasm" "location" "germplasmMcpd" "observationUnit" "germplasmAttribute" "germplasmPedigree" "germplasmProgeny" "program" "study" "trial") +#documentTypes=("germplasm" "germplasmMcpd" "germplasmAttribute" "germplasmPedigree" "germplasmProgeny" "location" "program" "study" "trial" "observationUnit") #documentTypes=("trial") # Restrict the generated documents to be linked to a specific GnpIS trial aka MIAPPE study diff --git a/etl_gnpis-core_dd/convert_to_json.sh b/etl_gnpis-core_dd/convert_to_json.sh new file mode 100755 index 0000000..075fd09 --- /dev/null +++ b/etl_gnpis-core_dd/convert_to_json.sh @@ -0,0 +1,75 @@ +#!/bin/bash + + +RED='\033[0;31m' +GREEN='\033[0;32m' +ORANGE='\033[0;33m' +BOLD='\033[1m' +RED_BOLD="${RED}${BOLD}" +NC='\033[0m' # No format +export RED GREEN ORANGE BOLD RED_BOLD NC + +# shellcheck disable=SC2120,SC2162 +colorize() { + # -- #!/bin/bash#colorize by JohnnyB - boeroboy@gmail.com + RED='\033[0;31m' + NC='\033[0m' # No format + + while read line + do + echo -e "${RED}${line}${NC}" + done < "${1:-/dev/stdin}" +} +export -f colorize + +[ -z $VERBOSE ] && VERBOSE=0 + +SCRIPT_DIR=$(readlink -f "$(dirname "$0")") + +help() { + echo "Usages (using command line options, or using environment variables):" + echo "$0 -node DD_NODE -data DATA_DIR [-verbose {1,2}]" + echo "DD_NODE=<node_name> DATA_DIR=</path_to_data_dir> VERBOSE=1 $0" +} + +# get params +while [ -n "$1" ]; do + case $1 in + -node) DD_NODE=$2;shift 2;; + -data) DATA_DIR=$(readlink -f "${2}");shift 2;; + -verbose) VERBOSE=$2;shift 2;; + -h) help; exit 0;; + --help) help; exit 0;; + --) shift;break;; + -*) echo -e "${RED}ERROR: Unknown option: $1${NC}" && echo && help && echo;exit 1;; + *) echo -e "${RED}ERROR: Number or arguments unexpected." 
&& echo && help && echo;exit 1;; + esac +done + +[ -z "$DD_NODE" ] && { echo -e "${RED}ERROR: missing -node argument or DD_NODE env variable.${NC}" ; help ; exit 1; } +[ -z "$DATA_DIR" ] && { echo -e "${RED}ERROR: missing -data argument or DATA_DIR env variable.${NC}" ; help ; exit 1; } + +[ ! -d "$DATA_DIR" ] && { echo -e "${RED}ERROR: given argument is not a directory: $DATA_DIR${NC}" ; exit 1; } +[ ! -r "$DATA_DIR" ] && { echo -e "${RED}ERROR: given directory is not readable: $DATA_DIR${NC}" ; exit 1; } +[ $VERBOSE -ge 0 ] 2>/dev/null || { echo -e "${RED}ERROR: -verbose option must be a positive integer, not: $VERBOSE${NC}" ; exit 1 ; } + +# (readlink -f ${DATA_DIR}) +for FILE in $DATA_DIR/*.csv; do + [ $VERBOSE -ge 2 ] && echo "Matching file: ${FILE}" ; + [ -f $FILE ] && FOUND_FILE=TRUE +done; +[ "$FOUND_FILE" == "TRUE" ] || { echo "ERROR: no valid csv file found in $DATA_DIR" ; exit 4 ; } + +[ -z $SEPARATOR ] && SEPARATOR='\t' # using tabulation as default separator + +[ -z $HEADER ] && HEADER="entryType${SEPARATOR}databaseName${SEPARATOR}identifier${SEPARATOR}name${SEPARATOR}description${SEPARATOR}url${SEPARATOR}species" +[ $VERBOSE -ge 1 ] && echo -e "Using header:\n$HEADER" +export DD_NODE + +for CSV_FILE in $DATA_DIR/*.csv; do + FILE=$(basename $CSV_FILE .csv) + [ $VERBOSE -ge 1 ] && echo "Processing $FILE from $CSV_FILE" + parallel --pipe-part --block 10M "sed '1 i$HEADER' | jq -Rr -s -f ${SCRIPT_DIR}/csv2json.jq > $DATA_DIR/${DD_NODE}_${FILE}_{#}_all_species.json 2> >(colorize)" :::: $CSV_FILE +done + +[ $VERBOSE -ge 0 ] && echo "JSON files generated into $DATA_DIR" diff --git a/etl_gnpis-core_dd/csv2json.jq b/etl_gnpis-core_dd/csv2json.jq new file mode 100644 index 0000000..d8f91a4 --- /dev/null +++ b/etl_gnpis-core_dd/csv2json.jq @@ -0,0 +1,63 @@ +# USAGE: +# $ jq -Rr -s -f csv2json.jq $CSV > $JSON +# Requires jq 1.6+ + +# replaces the leading and trailing blank only once, only for strings +def trimq: + if type == "string" + then (.|sub("^ +";"") | 
sub(" +$";"")) + else . + end +; + +# to_array_if_needed/1 splits the string on comma separator +# only if header = species +def to_array_if_needed(header): + if type == "string" + and (.|index(",") != null) + and header == "species" + then ( . | [split(",")[]| trimq ] ) + else . + end + ; + +# objectify/1 takes an array of string values as inputs, converts +# numeric values to numbers, and packages the results into an object +# with keys specified by the "headers" array +def objectify(headers): + def tonumberq: tonumber? // .; + def tonullq: if . == "" then null else . end; + + . as $in + | reduce range(0; headers|length) as $i ( + {}; headers[$i] as $header + | .[headers[$i]] = ( + $in[$i] + | to_array_if_needed($header) + | tonumberq + | trimq + | tonullq + ) + ) + ; + +def csv2table: + def trim: sub("^ +";"") | sub(" +$";""); # remove all leading and trailing spaces + split("\n") + | map( + split("\t") + | map(trim) + ); + +def csv2json: + csv2table + | .[0] as $headers + | reduce (.[1:][] | select(length > 0) ) as $row ( + []; . + [ $row|objectify($headers) + | .node = if $ENV.DD_NODE == null then "[ERROR]: the environment variable DD_NODE is missing to specify the name of the data provider (ie. INRAE-URGI, EBI or else).\n" | halt_error(1) else $ENV.DD_NODE end + | .name = if (.name == null) then .identifier? else .name end + | del(.dbVersion,.dbId,.xref,.featureType,.sequenceId,.sequenceVersion,.startPosition,.endPosition,.map,.mapPosition,.authority,.trait,.traitId,.environment,.environmentId,.statistic,.unit,.genotype,.experimentType,.linkedResourcesID) | select(keys | length > 1)] + ) + ; + +csv2json | if length == 0 then empty else . 
end diff --git a/etl_gnpis-core_dd/csv_manipulator.py b/etl_gnpis-core_dd/csv_manipulator.py new file mode 100644 index 0000000..a015c2a --- /dev/null +++ b/etl_gnpis-core_dd/csv_manipulator.py @@ -0,0 +1,48 @@ +#!/usr/bin/python + +############################################################################## +# Script used to manipulate csv and replace value in this file related to key +# stored in dictionnary. +# +# +# Check '<script>.py -h' for usage help +# +# +# Author: F. PHILIPPE +# +# Copyright INRA-URGI 2017 +############################################################################## + +import sys,csv,re + +csv.field_size_limit(sys.maxsize) + +def createDict_from_csv(csvfile): + dicoID={} + with open(csvfile) as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + if row['key'] not in dicoID: + dicoID[row['key']]=row["value"] + return(dicoID) + + +def _main(): + dicoID=createDict_from_csv(sys.argv[1]) + with open(sys.argv[3],"w") as out: + with open(sys.argv[2], "r") as file: + reader = csv.reader(file, delimiter=',') + for row in reader: + # To handle elasticsearch and solr indexation, it is necessary to know the columns' number in order to localize description field. + if len(row)<25: + string = row[4] + else: + string = row[6] + sequence=string + for cle in dicoID.keys(): + if(cle in string): + sequence=sequence.replace(cle, dicoID[cle]) + out.write("\""+"\",\"".join(row).replace(string,sequence)+"\"\n") + +if __name__ == "__main__": + _main() diff --git a/etl_gnpis-core_dd/extract_gnpis-core.sh b/etl_gnpis-core_dd/extract_gnpis-core.sh new file mode 100755 index 0000000..81f0bb6 --- /dev/null +++ b/etl_gnpis-core_dd/extract_gnpis-core.sh @@ -0,0 +1,380 @@ +#!/bin/bash +# +# extract_gnpis-core.sh +# +# Script used to extract a thematic data from GnpIS-core. +# +# Author: E. Kimmel, R. Flores, C. Michotey, D. 
Charruaud +# + + +### colorization +RED='\033[0;31m' +GREEN='\033[0;32m' +ORANGE='\033[0;33m' +BOLD='\033[1m' +RED_BOLD="${RED}${BOLD}" +NC='\033[0m' # No format +export RED GREEN ORANGE BOLD RED_BOLD NC + +# shellcheck disable=SC2120,SC2162 +colorize() { + # -- #!/bin/bash#colorize by JohnnyB - boeroboy@gmail.com + RED='\033[0;31m' + NC='\033[0m' # No format + + while read line + do + echo -e "${RED}${line}${NC}" + done < "${1:-/dev/stdin}" +} +export -f colorize + + +CURRENT_DIR=$(readlink -f "$(dirname "$0")") +PASSFILE="${HOME}/.pgpass" +SQL_FILE="${CURRENT_DIR}/sql/count_extracted_data.sql" +export CURRENT_DIR PASSFILE SQL_FILE + +### default values +DATABASE="aster" +DB_HOST="shelob.versailles.inrae.fr" +DB_PORT="9121" +DB_USER="aster" + +GNPIS_LEGACY_URL="https://urgi.versailles.inrae.fr" +FAIDARE_URL="https://urgi.versailles.inrae.fr/faidare" + +SOURCE_NAME="GnpIS" +DD_NODE="INRAE-URGI" # DD_NODE is used by tabulated/csv2json.jq for setting correctly the node name + +OUTPUT="${CURRENT_DIR}/output" +VERBOSE=0 +EXTRACT="TRUE" +THEMATIC="" + +DEFAULT_THEMATICS=" \ +association \ +sequences \ +mapping \ +synteny \ +transcriptome \ +genetic_resources \ +genotyping \ +phenotyping \ +static" + +# genetic_resources, phenotyping and genotyping are now managed by FAIDARE ETL but are used for XREF +DD_THEMATICS=$(echo $DEFAULT_THEMATICS | sed -e "s/genetic_resources genotyping phenotyping //") +XREF_THEMATICS=$(echo $DEFAULT_THEMATICS | sed -e "s/mapping synteny transcriptome //") +export DEFAULT_THEMATICS DD_THEMATICS XREF_THEMATICS + + +help4me () { + cat <<EOF + +USAGE: + $0 [-thematic <see below...>] [-static <static_dir>] [-database <database>] [-db_user <database_user>] [-db_host <database_host>] [-db_port <database_port>] [-gnpis_legacy_url <GNPIS_LEGACY_URL>] [-faidare_url <FAIDARE_URL>] [-v[v]] + +DESCRIPTION: + Script used to extract data from GnpIS-core database. 
+ +PARAMS: + -thematic thematics to process, by default all thematics are processed + -static static directory containing files following "gnpis_static_[source]_*.csv" name pattern, if you want to manage GnpIS static data (XXX@GnpIS) + -database the database to extract data from (DEFAULT: ${DATABASE}) + -db_user the user to connect to the database (DEFAULT: ${DB_USER}) + -db_host the host of the database to index (DEFAULT: ${DB_HOST}) + -db_port the port of the database to index (DEFAULT: ${DB_PORT}) + -gnpis_legacy_url prefix URL of the GnpIS legacy applications to point to (DEFAULT: ${GNPIS_LEGACY_URL}) + -faidare_url url of the FAIDARE application to point to (DEFAULT: ${FAIDARE_URL}) + --transform_only does not extract data, use data already present into output directory (${OUTPUT}) + -v display verbose informations + -vv display very verbose informations + -h or --help print this help + +WARNING: + If not given, the credentials of the user MUST be referenced in ${PASSFILE} using the database name as a suffix. + +AVAILABLE THEMATICS ARE: +${DEFAULT_THEMATICS} +EOF + exit 1 +} + +check_error() { + CODE=$1 + if [[ $CODE -ne 0 ]];then + echo -e "${RED_BOLD}Error $CODE occured: should check that passfile is given and correclty filled (user, password for given database, etc.): ${PASSFILE}${NC}" + echo -e "${RED_BOLD}Exiting.${NC}" + exit 1 + fi +} +export -f check_error + +echo_and_eval_cmd() { + local CMD=$1 + if [ -z "$CMD" ]; then + echo -e "${RED_BOLD}Missing command to eval. Exiting.${NC}" + exit 1 + fi + [ $VERBOSE -ge 2 ] && echo -e "Going to exec command: \n\t${CMD}" + eval $CMD 2> >(colorize) + check_error $? 
+} +export -f echo_and_eval_cmd + +### get params +while [ -n "$1" ]; do + case $1 in + -h) help4me;shift 1;; + --help) help4me;shift 1;; + -thematic) THEMATICS=$(echo "$2" | tr ',' ' '); if [ -z "$2" ]; then shift 1; else shift 2; fi;; + -static) STATIC_DIR=$2;shift 2;; + -database) DATABASE=$2;shift 2;; + -db_user) DB_USER=$2; shift 2;; + -db_host) DB_HOST=$2;shift 2;; + -db_port) DB_PORT=$2;shift 2;; + -gnpis_legacy_url) GNPIS_LEGACY_URL="$2"; shift 2;; + -faidare_url) FAIDARE_URL="$2"; shift 2;; + --transform_only) EXTRACT="FALSE"; shift 1;; + -v) VERBOSE=1 ; VERBOSE_OPTION="-v" ; PARALLEL_VERBOSE="--bar" ; shift 1;; + -vv) VERBOSE=2 ; VERBOSE_OPTION="-vv" PARALLEL_VERBOSE_2="--bar" ; shift 1;; + --) shift;break;; + -*) echo && echo -e "${RED_BOLD}Unknown option: $1${NC}" && echo;exit 1;; + *) break;; + esac +done + +### check options +if [ "${EXTRACT}" == "TRUE" ] ; then + [ -z "$DATABASE" ] && echo -e "${RED_BOLD}-database is required.${NC}" && help4me && exit 1 + + # check password file + if [ ! -f "$PASSFILE" ]; then + echo -e "${RED_BOLD}The password file ($PASSFILE) does not exists.${NC}" + exit 2 + fi +else + [[ $(find "${OUTPUT}" -type f -name "*.csv" -ls | wc -l) -eq 0 ]] && echo -e "${RED_BOLD}No CSV found in ${OUTPUT}. Please provide a not empty directory." && help4me && exit 4 +fi + +if [ -z "$THEMATICS" ] || [ "$THEMATICS" == "all" ]; then + echo -e "${ORANGE}No thematics specified, using default list.${NC}" + THEMATICS=$DEFAULT_THEMATICS +else + for THEMATIC in ${THEMATICS}; do + if [ -z "$(echo ${THEMATICS} | grep ${THEMATIC})" ]; then + echo -e "${ORANGE}Unknown thematic ${THEMATIC} to extract. Ignoring it.${NC}" + fi + done + if [ "$(echo ${THEMATICS} | grep 'static')" ]; then + if [ -z "$STATIC_DIR" ]; then + echo -e "${RED_BOLD}Static directory is mandatory if you want to manage static data.${NC}" + exit 2 + elif [ ! 
-d "${STATIC_DIR}" ] || [ $(find ${STATIC_DIR} -type f -name "gnpis_static_*.csv" -ls | wc -l) -eq 0 ] ; then + echo -e "${RED_BOLD}You want to manage static data but static directory does not exist or it contains no files to manage ('gnpis_static_[source]_*.csv').${NC}" + exit 2 + fi + fi +fi + +[ ! -d "$OUTPUT" ] && mkdir "$OUTPUT" + +PG_CONNECT="-U ${DB_USER} -p ${DB_PORT} -h ${DB_HOST} -d ${DATABASE}" +echo_and_eval_cmd "psql -w ${PG_CONNECT} -c '\conninfo'" # .pgpass file must be correct such as: host:port:database:user:password + +# Need to export variables and functions for them to be available inside parallel command +export EXTRACT VERBOSE VERBOSE_OPTION PARALLEL_VERBOSE PARALLEL_VERBOSE_2 +export PG_CONNECT GNPIS_LEGACY_URL FAIDARE_URL SOURCE_NAME DD_NODE STATIC_DIR OUTPUT + + +extract_thematic(){ + local LOCAL_THEMATIC="$1" + local SQL_SCRIPT="sql/transplant_gnpis_${LOCAL_THEMATIC}_extraction.sql" + + local APPLICATION_URL="${GNPIS_LEGACY_URL}" + if [ "${LOCAL_THEMATIC}" == "genetic_resources" ]; then + APPLICATION_URL="${FAIDARE_URL}" + fi + + # check sql file + if [ ! 
-f "${SQL_SCRIPT}" ]; then + echo -e "${RED_BOLD}The SQL script (${SQL_SCRIPT}) does not exists.${NC}" + exit 2 + fi + + [ $VERBOSE -ge 2 ] && echo "Sql script is : ${SQL_SCRIPT}" + # Execute command to extract data + echo_and_eval_cmd "psql -q -w ${PG_CONNECT} -v source_name=${SOURCE_NAME} -v thematic=${LOCAL_THEMATIC} -v application_url=${APPLICATION_URL} -f ${SQL_SCRIPT}" +} +export -f extract_thematic + +check_extracted_data() { + local LOCAL_THEMATIC="$1" + local COUNT_STATUS=0 + + if [ -n "$(ls -1 -- *.csv)" ]; then + for FILE in "gnpis_${LOCAL_THEMATIC}"*.csv; do + TYPE=$(echo "$FILE" | sed -r 's/^gnpis_('"${LOCAL_THEMATIC}"'_.+)\.csv$/\1/' 2> >(colorize)) + + COUNT_DATA_DB=$(psql -w ${PG_CONNECT} -tA -v type="${TYPE}" -f "${SQL_FILE}" 2> >(colorize)) + COUNT_DATA_FILE=$(wc -l "$FILE" 2> >(colorize) | cut -d ' ' -f1 2> >(colorize)) + + if [ "$COUNT_DATA_DB" = "" ]; then + echo -e "${ORANGE}Warning: can not check data count for ${TYPE} (SQL query missing in count_extracted_data.sql script)${NC}" + elif [ "$COUNT_DATA_DB" != "$COUNT_DATA_FILE" ]; then + echo -e "${RED_BOLD}Error: expected ${COUNT_DATA_DB} data but got ${COUNT_DATA_FILE} extracted data for ${TYPE}${NC}" + ((COUNT_STATUS++)) + fi + + if [ "$COUNT_DATA_DB" = "0" ]; then + rm "$FILE" 2> >(colorize) + else + mv "$FILE" "${OUTPUT}"/ 2> >(colorize) + fi + done + if [[ $COUNT_STATUS -ne 0 ]] ; then + echo -e "${RED_BOLD}Errors detected, aborting extraction process.${NC}" + #exit 1 + fi + else + [ $VERBOSE -ge 1 ] && echo -e "${ORANGE}No csv files found...${NC}" + fi +} +export -f check_extracted_data + +enrich_csv(){ + CSV_FILE="$1" + CURRENT_THEMATIC="$2" + + if [ "${CURRENT_THEMATIC}" == "phenotyping" ]; then + [ $VERBOSE -ge 1 ] && echo "Enrich variables..." + [ $VERBOSE -ge 2 ] && echo "processing with file : ${CSV_FILE}..." 
+ "${CURRENT_DIR}"/variables_enrichment.sh -f "${CSV_FILE}" "${VERBOSE_OPTION}" + fi +} +export -f enrich_csv + +transform_private_url(){ + # change URL to their private form when first field does not start by a zero (2 pass: 1 => gnpis legacy ; 2 => faidare) + # transformed files are written into `${OUTPUT}/privatised` sub-directory + + CSV_FILE="$1" + if [[ ! "${FAIDARE_URL}" =~ "https://urgi.versailles.inrae.fr/faidare-" ]] ; then # we have a production URL + sed 's/\t/ /g ; s/^"//g ; s/","/\t/g ; s/"$//g' "${CSV_FILE}" | sed -r "s#^([^0].*)(https://urgi.versailles.inrae.fr)(.*)#\1\2/private\3#g ; s#^([^0].*)(https://urgi.versailles.inrae.fr/private/faidare)(.*)#\1https://urgi.versailles.inrae.fr/faidare-private\3#g" > "${OUTPUT}/privatised/$(basename "$CSV_FILE")" + else + sed 's/\t/ /g ; s/^"//g ; s/","/\t/g ; s/"$//g' "${CSV_FILE}" > "${OUTPUT}/privatised/$(basename "$CSV_FILE")" + fi + # FAIDARE public/private is only for production env because faidare-int handles the groups via Apache. + # In case of int or staging env, the URL given in script parameter should be already handled by Apache withotu further modification. +} +export -f transform_private_url + +dd_convert_csv_to_json() { + [ ! -d "${OUTPUT}/data_discovery" ] && mkdir "${OUTPUT}/data_discovery" + [ $VERBOSE -ge 1 ] && echo "Transform CSV to DataDiscovery JSON..." + + for LOCAL_THEMATIC in $THEMATICS; do + if [ -z "$(echo ${DD_THEMATICS} | grep ${LOCAL_THEMATIC})" ]; then + echo -e "${ORANGE}Thematic ${LOCAL_THEMATIC} cannot be transformed in data-discovery format. 
Ignoring it.${NC}" + continue + fi + + if [ -n "$(ls -1 ${OUTPUT}/privatised/gnpis_${LOCAL_THEMATIC}*.csv)" ]; then + cp ${OUTPUT}/privatised/gnpis_${LOCAL_THEMATIC}*.csv ${OUTPUT}/data_discovery/ + parallel ${PARALLEL_VERBOSE_2} enrich_csv "{}" "${LOCAL_THEMATIC}" ::: "${OUTPUT}/data_discovery/gnpis_${LOCAL_THEMATIC}*.csv" + else + [ $VERBOSE -ge 1 ] && echo -e "${ORANGE}No CSV file matching gnpis_${LOCAL_THEMATIC}*.csv found...${NC}" + continue + fi + done + + export HEADER="groupId\tentryType\tdatabaseName\tidentifier\tname\tdescription\turl\tspecies" + export VERBOSE + [ $VERBOSE -ge 2 ] && echo "Running convert_to_json.sh" + "${CURRENT_DIR}"/convert_to_json.sh -data "${OUTPUT}/data_discovery" + + [ $? -ne 0 ] || echo -e "${GREEN}Transformation finished, DD JSON available into ${OUTPUT}/data_discovery!\n${NC}" +} +export -f dd_convert_csv_to_json + +xref_convert_csv_to_json() { + local LOCAL_THEMATIC="$1" + local FILENAME PREFIX CSV_LENGTH SPLIT_VALUE + + [ ! -d "${OUTPUT}/xref" ] && mkdir "${OUTPUT}/xref" + [ $VERBOSE -ge 1 ] && echo "Generate XREF JSON for ${LOCAL_THEMATIC} thematic..." + + if [ -z "$(echo ${XREF_THEMATICS} | grep ${LOCAL_THEMATIC})" ]; then + echo -e "${ORANGE}Thematic ${LOCAL_THEMATIC} cannot be transformed in xref format. Ignoring it.${NC}" + return 1 + fi + + if [ ! -n "$(ls -1 ${OUTPUT}/privatised/gnpis_${LOCAL_THEMATIC}*.csv)" ]; then + [ $VERBOSE -ge 1 ] && echo -e "${ORANGE}No CSV file matching gnpis_${LOCAL_THEMATIC}*.csv found...${NC}" + return 1 + fi + + for CSV_FILE in "${OUTPUT}/privatised/gnpis_${LOCAL_THEMATIC}"*.csv; do + [ $VERBOSE -ge 2 ] && echo "processing with file : ${CSV_FILE}..." + PREFIX="xref-${LOCAL_THEMATIC}-" + CSV_LENGTH=$(wc -l < "$CSV_FILE") + SPLIT_VALUE=10000 + if [ "${CSV_LENGTH}" -gt "$SPLIT_VALUE" ];then + [ $VERBOSE -ge 2 ] && echo "Splitting processus..." 
+ split -d -l $SPLIT_VALUE "${CSV_FILE}" "${PREFIX}" + for SPLIT_FILE in ${PREFIX}*; + do + CMD_JQ="jq --slurp --raw-input --raw-output --compact-output -f ${CURRENT_DIR}/map_values_to_json.jq '${SPLIT_FILE}' > ${OUTPUT}/xref/${SPLIT_FILE}.json" + echo_and_eval_cmd "${CMD_JQ}" + done + rm -v ${CURRENT_DIR}/${PREFIX}* + else + [ $VERBOSE -ge 2 ] && echo "No splitting processus needed..." + CMD_JQ="jq --slurp --raw-input --raw-output --compact-output -f ${CURRENT_DIR}/map_values_to_json.jq '${CSV_FILE}' > '${OUTPUT}/xref/${PREFIX}1.json'" + echo_and_eval_cmd "${CMD_JQ}" + fi + done + + [ $? -ne 0 ] || echo -e "${GREEN}Generation finished, ${LOCAL_THEMATIC} XREF JSON available into ${OUTPUT}/xref!\n${NC}" +} +export -f xref_convert_csv_to_json + +process_thematic() { + local LOCAL_THEMATIC="$1" + + if [ "${EXTRACT}" == "TRUE" ] && [ "${LOCAL_THEMATIC}" != "static" ]; then + [ $VERBOSE -ge 1 ] && echo "Process ${LOCAL_THEMATIC} thematic..." + extract_thematic "${LOCAL_THEMATIC}" + [ $VERBOSE -ge 1 ] && echo "Check extracted data..." + check_extracted_data "${LOCAL_THEMATIC}" + elif [ "${LOCAL_THEMATIC}" = "static" ]; then + [ $VERBOSE -ge 1 ] && echo "Process static files..." + cp ${STATIC_DIR}/gnpis_*.csv ${OUTPUT} + else + return 1 + fi + + echo -e "${GREEN}Extraction finished for ${LOCAL_THEMATIC} thematic!\n${NC}" +} +export -f process_thematic + +echo -e "\n${BOLD}Extract data...${NC}" +parallel -j4 ${PARALLEL_VERBOSE} process_thematic ::: ${THEMATICS} + +echo -e "\n${BOLD}Manage private data...${NC}" +[ ! -d "${OUTPUT}/privatised" ] && mkdir "${OUTPUT}/privatised" +if [ -n "$(ls -1 ${OUTPUT}/gnpis_*.csv)" ]; then + [ $VERBOSE -ge 1 ] && echo "Transform private URL..." 
+ parallel ${PARALLEL_VERBOSE_2} transform_private_url ::: "${OUTPUT}/gnpis_"*.csv +else + [ $VERBOSE -ge 1 ] && echo -e "${ORANGE}No CSV file matching gnpis_*.csv found...${NC}" +fi +echo -e "${GREEN}Privatisation finished!${NC}" + +echo -e "\n${BOLD}Convert CSV to JSON...${NC}" +dd_convert_csv_to_json +parallel -j4 ${PARALLEL_VERBOSE} xref_convert_csv_to_json ::: ${THEMATICS} + +exit diff --git a/etl_gnpis-core_dd/extract_observation_variables.jq b/etl_gnpis-core_dd/extract_observation_variables.jq new file mode 100644 index 0000000..4b5c0d4 --- /dev/null +++ b/etl_gnpis-core_dd/extract_observation_variables.jq @@ -0,0 +1,29 @@ +#!/usr/bin/env jq -Mf +# Produces: +# [ +# { "BFF:0000001": "BFF:0000001 CH_cm (Canopy Height - Hauteur de canop�e)"}, +# { "BFF:0000005": "BFF:0000005 HMax_cm (Plant maximum height - Hauteur maximale plante)"} +# ] +# +# Then transform it to CSV format: +# BFF:0000001,BFF:0000001 CH_cm (Canopy Height - Hauteur de canop�e) +# BFF:0000005,BFF:0000005 HMax_cm (Plant maximum height - Hauteur maximale plante) +[ + (. | group_by(.observationVariableDbId|tostring) + |.[] + | + { (.[0].observationVariableDbId|tostring ) : + (.[0].observationVariableDbId + " " + .[0].name + # use first observationVariableDbId and name as they are the same for each entries grouped by observationVariableDbId + " (" + + ( + [ .[] | .synonyms? 
] # create an array of all synonyms if present + | add # merge all synonyms in a unique array + | select (length > 0) # continue only if at least have 1 synonym + | join(" - ") # concat synonyms with a dash + ) + + ")" + ) + } + )| to_entries[] +] +| .[] | .key + "," + .value # transform to CSV format \ No newline at end of file diff --git a/etl_gnpis-core_dd/map_values_to_json.jq b/etl_gnpis-core_dd/map_values_to_json.jq new file mode 100644 index 0000000..f457189 --- /dev/null +++ b/etl_gnpis-core_dd/map_values_to_json.jq @@ -0,0 +1,28 @@ +#!/usr/bin/env jq -Mf +[ +split("\n") | .[] + | select(length>0) # ignore empty lines + | split("\t") # produces an array by line with + | + { + "groupId": .[0]|tonumber, + "entryType": .[1], + "databaseName": .[2], + "identifier": .[3], + "name": .[4], + "description": .[5], + "url": .[6], + "species": .[7]|tostring|split("%2C "), # TODO: check that this split is done correctly, I doubt... + "linkedResourcesID": + ([ + foreach (.[8]? | tostring | split(", ")[]) as $pui + ([[],[]]; + if ($pui != null and $pui != "" and $pui != "null") then + ($pui | @base64) + else + empty + end + ) + ]) + } +] diff --git a/etl_gnpis-core_dd/sql/count_extracted_data.sql b/etl_gnpis-core_dd/sql/count_extracted_data.sql new file mode 100644 index 0000000..bfe6e9e --- /dev/null +++ b/etl_gnpis-core_dd/sql/count_extracted_data.sql @@ -0,0 +1,83 @@ +-- Example of usage: +-- psql -h shelob.versailles.inrae.fr -p 9122 -U scratchy -d scratchy -tA -v type=germplasm -v trialId=NULL -f count_extracted_data.sql + +select CASE + -- + -- genetique ressources + WHEN :'type' = 'genetic_resources_accessions' THEN + (select count(distinct accession_id) + from accession) + -- + -- association + WHEN :'type' = 'association_analyses' THEN + (select count(distinct association_analysis_id) + from association_analysis) + -- + -- phenotyping + WHEN :'type' = 'phenotyping_trials' THEN + (select count(distinct trial_id) + from trial) + -- + -- genotyping/polymorphism + 
WHEN :'type' = 'genotyping_experiments' THEN + (select count(distinct genotyping_experiment_id) + from genotyping_experiment) + -- + -- cartography + WHEN :'type' = 'mapping_maps' THEN + (select count(distinct map_id) + from map) + WHEN :'type' = 'mapping_mapped_markers' THEN + (select count(distinct m.marker_id) + from marker m + inner join locus l on l.marker_id = m.marker_id) + WHEN :'type' = 'mapping_not_mapped_markers' THEN + (select count(distinct m.marker_id) + from marker m + left join locus l on l.marker_id = m.marker_id + where l.marker_id is null) + WHEN :'type' = 'mapping_qtls' THEN + (select count(distinct mappable_elemt_id) + from qtl) + WHEN :'type' = 'mapping_metaqtls' THEN + (select count(distinct mappable_elemt_id) + from meta_qtl) + -- + -- sequences + WHEN :'type' = 'sequences_ngs_experiments' THEN + (select count(distinct experiment_id) + from ngs_experiment) + WHEN :'type' = 'sequences_ngs_analyses' THEN + (select count(distinct analysis_id) + from ngs_analysis) + -- + -- synteny + --WHEN :'type' = 'synteny_genes' THEN + --(select count(distinct ga.GENE_ASSIGNMENT_ID) + -- --COUNT(distinct 'SYNTENY_DS_' || d.dataset_id || '_AC_' || ac.ANCESTRAL_CHROMOSOME_NAME || '_' || g.gene_name ) + -- --distinct ('SYNTENY_DS_' || d.dataset_id || '_AC_' || ac.ANCESTRAL_CHROMOSOME_NAME || '_' || g.gene_name ) + --FROM GENE_ASSIGNMENT ga + --JOIN GENE g on ga.GENE_ID = g.GENE_ID + --JOIN GENE_HOMOLOGY_GROUP ghg ON ghg.GENE_ID = g.GENE_ID + --JOIN HOMOLOGY_GROUP hg ON hg.HOMOLOGY_GROUP_ID = ghg.HOMOLOGY_GROUP_ID + --JOIN DATASET d ON d.DATASET_ID = hg.DATASET_ID + --JOIN ANCESTRAL_GENE ag on hg.ancestral_gene_id=ag.ancestral_gene_id + --JOIN ANCESTRAL_CHROMOSOME ac on ac.ANCESTRAL_CHROMOSOME_ID=ag.ANCESTRAL_CHROMOSOME_ID + --WHERE d.IS_CURRENT_VERSION='true' + --AND d.DATASET_TYPE_ID=450) + -- --AND d.DATASET_ID = 6 + -- + -- transcriptome + WHEN :'type' = 'transcriptome_experiments' THEN + (select count(distinct experiment_id) + from experiment) + WHEN 
:'type' = 'transcriptome_genes' THEN + (select count(distinct g.gene_id) + from gene g + join gene_gene_list ggl on ggl.gene_id = g.gene_id + join gene_list gl on gl.gene_list_id = ggl.gene_list_id) + WHEN :'type' = 'transcriptome_gene_lists' THEN + (select count(distinct gene_list_id) + from gene_list) + END +as count; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_association_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_association_extraction.sql new file mode 100644 index 0000000..03357bf --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_association_extraction.sql @@ -0,0 +1,85 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): E. Kimmel, R. Flores, D. Charruaud +-- Created on 2014/07/22 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. +-- #################################################################### + +-- ################################################################################### +-- SQL script used to extract data for transPLANT indices, gnpis thematic: association +-- ################################################################################### + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + + +-- extract association analyses + +\o gnpis_'':thematic''_analyses.csv + +SELECT DISTINCT + '"' || CAST(A.group_id as VARCHAR(3)) || '"' AS group_id, + '"GWAS analysis"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + '"' || CONCAT('GWAS_ANALYSIS_' , A.ASSOCIATION_ANALYSIS_ID) || '_' || A.ANALYSIS_NAME || '"' AS identifier, + '"' || REPLACE(A.ANALYSIS_NAME, '"', '''') ||'"' AS name, + '"' || + CONCAT(REPLACE(A.ANALYSIS_NAME, '"', ''''), ' is a GWAS analysis', + CASE WHEN OT.NAME_EN IS NOT NULL AND OT.NAME_EN != '' THEN + ' related to ' || OT.NAME_EN + END, + CASE WHEN OT.DEFINITION_EN 
IS NOT NULL AND OT.DEFINITION_EN != '' THEN + ' (' || OT.DEFINITION_EN || ')' + END, + ' involving ' , DNA.PANEL_NAME, ' panel, in the scope of ', GE.EXPERIMENT_NAME, ' experiment', + -- ' involving ' , P.PANEL_NAME, ' panel, in the scope of ', GE.EXPERIMENT_NAME, ' experiment', + CASE WHEN GE.DESCRIPTION IS NOT NULL AND GE.DESCRIPTION != '' THEN + ' which is described as: ''' || GE.DESCRIPTION || '''' END, + CASE WHEN G.GENOME_NAME IS NOT NULL AND G.GENOME_NAME != '' THEN + ' on genome ' || G.GENOME_NAME || ' (' || T.SCIENTIFIC_NAME || ')' END + , '. Phenotyping campaign: ' , PC.NAME , + ' markers :', string_agg(DISTINCT(DNA.MARKER_NAME), ' , '), + ' and traits: ', string_agg(DISTINCT(DNA.TRAIT_NAME), ' , ') + ) || '."' AS description, + '"' || CONCAT(:'application_url', '/association/association/viewer.do#results/analysisIds=', A.ASSOCIATION_ANALYSIS_ID) || '"' AS url, + '"' || CASE WHEN taxons IS NULL THEN '' ELSE taxons END || '"' AS species, + '"' || nullif(concat_ws(', ', + CASE WHEN encoded_puids IS NOT NULL THEN encoded_puids END, + CASE WHEN tr.trial_number IS NOT NULL THEN ('urn:URGI/study/'||tr.trial_number)::text END, + CASE WHEN tr.site_id IS NOT NULL THEN ('urn:URGI/location/'||tr.site_id)::text END + ), '') || '"' AS linkedRessourcesID +FROM ASSOCIATION_ANALYSIS A + JOIN OBSERVATION_VARIABLE OV ON OV.OBSERVATION_VARIABLE_ID = A.VARIABLE_ID + LEFT JOIN ONTOLOGY_TERM OT ON OT.ONTOLOGY_TERM_ID = OV.DESCRIPTOR_ID + LEFT JOIN PHENOTYPING_CAMPAIGN PC ON PC.PHENOTYPING_CAMPAIGN_ID = A.PHENOTYPING_CAMPAIGN_ID + LEFT JOIN TRIAL TR ON TR.TRIAL_ID = PC.TRIAL_ID + JOIN GWAS_EXPERIMENT GE ON GE.GWAS_EXPERIMENT_ID = A.GWAS_EXPERIMENT_ID + -- LEFT JOIN PANEL P ON P.PANEL_ID = GE.PANEL_ID + LEFT JOIN TREATMENT_FACTOR TF ON TF.TREATMENT_FACTOR_ID = A.TREATMENT_FACTOR_ID + LEFT JOIN GENOME G ON G.GENOME_ID = A.GENOME_VERSION_ID + LEFT JOIN TAXON T ON T.TAXON_ID = G.TAXON_ID + LEFT JOIN DN_ASSOCIATION DNA ON DNA.ASSOCIATION_ANALYSIS_ID = A.ASSOCIATION_ANALYSIS_ID + 
LEFT JOIN( + SELECT DISTINCT GE.GWAS_EXPERIMENT_ID AS ge_id, + string_agg( + distinct( + CASE WHEN a.puid like 'gnpis_pui%' then + 'urn:URGI/' ||(replace(a.puid, ':', '%3A')) + ELSE + a.puid + END + ) + , ', ' + ) AS encoded_puids, + string_agg(distinct(t.SCIENTIFIC_NAME), ', ') AS taxons + FROM GWAS_EXPERIMENT GE + LEFT JOIN PANEL_LOT pl ON pl.PANEL_ID = GE.PANEL_ID + LEFT JOIN LOT l ON l.LOT_ID = pl.LOT_ID + LEFT JOIN ACCESSION a ON a.ACCESSION_ID = l.ACCESSION_ID + JOIN TAXON t on t.TAXON_ID = a.TAXON_ID + GROUP BY GE.GWAS_EXPERIMENT_ID) AS PUID_ACCESSIONS ON PUID_ACCESSIONS.ge_id = GE.GWAS_EXPERIMENT_ID + group by GE.GWAS_EXPERIMENT_ID, GE.EXPERIMENT_NAME, A.GROUP_ID, PUID_ACCESSIONS.ENCODED_PUIDS, A.ANALYSIS_NAME, A.ASSOCIATION_ANALYSIS_ID, OT.NAME_EN, OT.DEFINITION_EN, DNA.PANEL_NAME, G.GENOME_NAME, GE.DESCRIPTION, PC.NAME, TR.SITE_ID, TR.TRIAL_NUMBER, T.SCIENTIFIC_NAME, PUID_ACCESSIONS.TAXONS +ORDER BY identifier; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_genetic_resources_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_genetic_resources_extraction.sql new file mode 100644 index 0000000..6488536 --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_genetic_resources_extraction.sql @@ -0,0 +1,146 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): R. Flores, D. Charruaud, E. Kimmel +-- Created on 2014/12/08 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. 
+-- #################################################################### + +-- ######################################################################################### +-- SQL script used to extract data for transPLANT indices, gnpis thematic: genetic resources +-- ######################################################################################### + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + +-- extract ACCESSION + +\o gnpis_'':thematic''_accessions.csv + +SELECT DISTINCT + '"' || CAST(A.group_id as VARCHAR(3)) || '"' AS group_id, + '"Germplasm"' AS entry_type, + '"' || :'source_name' || '"' AS database_name, + '"' || a.puid || '"' AS identifier, + '"' || replace(a.accession_name, '"', '''') || ' (' || a.accession_number || ')"' AS name, + '"' || CONCAT( + replace(a.accession_name, '"', '''') , + ' is a ' || t.scientific_name , + ' accession (number: ' || accession_number || ')', + CASE WHEN status.name_en IS NOT NULL THEN + ' (status: ' || status.name_en || ')' END , + CASE WHEN grc.grc_code IS NOT NULL THEN + ' maintained by the ' || lower(grc.grc_code) || ' BRC (managed by ' || grci.organization || ')' END , + CASE WHEN hi.organization IS NOT NULL THEN + ', held by ' || hi.organization END , + CASE WHEN di.organization IS NOT NULL THEN + ', given by ' || di.organization || + CASE WHEN a.donation_date IS NOT NULL THEN + ' on ' || a.donation_date END + END , + CASE WHEN ci.organization IS NOT NULL THEN + ', collected by ' || ci.organization || + CASE WHEN a.collecting_date IS NOT NULL THEN + ' on ' || a.collecting_date END + END , + CASE WHEN acc_coll.collections IS NOT NULL THEN + '. This accession is part of collection(s): ' || acc_coll.collections END , + CASE WHEN agg_taxon_common_names.taxon_synonym_names IS NOT NULL THEN + '. Its taxon is also known as: ' || agg_taxon_common_names.taxon_synonym_names END , + CASE WHEN agg_taxon_synonym.taxon_synonym_names IS NOT NULL THEN + '. 
This taxon has also some synonym(s): ' || agg_taxon_synonym.taxon_synonym_names END , + CASE WHEN synonym_names IS NOT NULL THEN + '. This accession has also some synonym(s): ' || synonym_names END + ) ||'"' AS description, + '"' || CONCAT(:'application_url', '/germplasm?pui=', a.puid) || '"' AS url, + '"' || t.scientific_name || '"' AS species, + '"' || concat_ws(', ', + CASE WHEN acc_sites.encoded_sites IS NOT NULL AND acc_sites.encoded_sites != '' THEN acc_sites.encoded_sites END, + CASE WHEN acc_trial.encoded_trials_sites IS NOT NULL AND acc_trial.encoded_trials_sites != '' THEN acc_trial.encoded_trials_sites END, + CASE WHEN acc_trial.encoded_trials IS NOT NULL AND acc_trial.encoded_trials != '' THEN acc_trial.encoded_trials END, + CASE WHEN acc_geno.encoded_genotypings IS NOT NULL AND acc_geno.encoded_genotypings != '' THEN acc_geno.encoded_genotypings END + ) || '"' AS linkedRessourcesID +FROM accession a +JOIN taxon t ON a.taxon_id = t.taxon_id +-- aggregates sites in one line +LEFT JOIN + (select accession_id as aid, + concat_ws(', ', + ('urn:URGI/location/'||site_id)::text, + ('urn:URGI/location/'||origin_site_id)::text + ) AS encoded_sites + from accession + ) as acc_sites on acc_sites.aid = a.accession_id +LEFT JOIN institution hi ON a.holding_institution_id = hi.institution_id +LEFT JOIN institution bi ON a.breeder_institution_id = bi.institution_id +LEFT JOIN institution di ON a.donor_institution_id = di.institution_id +LEFT JOIN institution ci ON a.collector_institution_id = ci.institution_id +LEFT JOIN ontology_term status ON a.presence_status_id = status.ontology_term_id +LEFT JOIN grc grc ON grc.grc_id = a.grc_id +LEFT JOIN institution grci ON grc.managing_institution_id = grci.institution_id +-- aggregates accession's collections in one line +LEFT JOIN ( + SELECT + a.accession_id AS aid, string_agg(distinct(tr.translated_name), ', ') AS collections + FROM accession a + LEFT JOIN accession_collection ac ON ac.accession_id = a.accession_id + LEFT 
JOIN collections coll ON coll.collection_id = ac.collection_id + LEFT JOIN translations tr ON tr.named_collection_id = coll.collection_id + WHERE tr.language_id = (SELECT language_id FROM languages WHERE language_code = 'en') + GROUP BY a.accession_id + ORDER BY a.accession_id +) AS acc_coll ON acc_coll.aid = a.accession_id +-- aggregates accession's synonyms in one line +LEFT JOIN ( + SELECT acc.accession_id AS aids, string_agg(distinct(accsyn.accession_synonym_name), ', ') AS synonym_names + FROM accession acc + JOIN accession_synonym accsyn on accsyn.accession_id = acc.accession_id + GROUP BY acc.accession_id) AS acc_synonyms ON a.accession_id = acc_synonyms.aids +-- aggregates taxon's common names in one line +LEFT JOIN + (select taxon_id as t_id, string_agg(distinct(ta_synonym_name), ', ') as taxon_synonym_names + from ( + select distinct ta.taxon_id as taxon_id, ts.taxon_synonym_name as ta_synonym_name + from taxon ta + join taxon_synonym_taxon tst on tst.taxons_id = ta.taxon_id + join taxon_synonym ts on ts.taxon_synonym_id = tst.taxon_synonyms_id + join ontology_term ot ON ot.ontology_term_id = ts.name_type_id + where ot.textual_code != 'SCIENTIFIC') as t_id_t_synonym + group by t_id_t_synonym.taxon_id + ) as agg_taxon_common_names on agg_taxon_common_names.t_id = t.taxon_id +-- aggregates taxon's synonyms in one line +LEFT JOIN + (select taxon_id as t_id, string_agg(distinct(ta_synonym_name), ', ') as taxon_synonym_names + from ( + select distinct ta.taxon_id as taxon_id, ts.taxon_synonym_name as ta_synonym_name + from taxon ta + join taxon_synonym_taxon tst on tst.taxons_id = ta.taxon_id + join taxon_synonym ts on ts.taxon_synonym_id = tst.taxon_synonyms_id + join ontology_term ot ON ot.ontology_term_id = ts.name_type_id + where ot.textual_code = 'SCIENTIFIC') as t_id_t_synonym + group by t_id_t_synonym.taxon_id + ) as agg_taxon_synonym on agg_taxon_synonym.t_id = t.taxon_id +-- aggregates trials in one line +LEFT JOIN ( + SELECT a.accession_id AS aid, 
string_agg(distinct(('urn:URGI/study/'||trial.trial_number)::text), ', ') AS encoded_trials, + string_agg(distinct(('urn:URGI/location/'||site.site_id)::text), ', ') AS encoded_trials_sites + FROM accession a + LEFT JOIN lot l ON l.accession_id = a.accession_id + LEFT JOIN trial_lot tl ON tl.lots_id = l.lot_id + LEFT JOIN trial trial ON trial.trial_id = tl.trials_id + LEFT JOIN site site ON site.site_id = trial.site_id + GROUP BY a.accession_id +) AS acc_trial ON acc_trial.aid = a.accession_id +-- aggregates genotypings in one line +LEFT JOIN ( + SELECT a.accession_id AS aid, string_agg(distinct(('urn:URGI/study/'||ge.genotyping_experiment_id)::text), ', ') AS encoded_genotypings + FROM accession a + LEFT JOIN lot l ON l.accession_id = a.accession_id + LEFT JOIN genotyping_exp_lot gel ON gel.lot_id = l.lot_id + LEFT JOIN genotyping_experiment ge ON ge.genotyping_experiment_id = gel.genotyping_experiment_id + GROUP BY a.accession_id +) AS acc_geno ON acc_geno.aid = a.accession_id + +ORDER BY identifier; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_genotyping_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_genotyping_extraction.sql new file mode 100644 index 0000000..f29d332 --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_genotyping_extraction.sql @@ -0,0 +1,103 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): E. Kimmel, D. Charruaud +-- Created on 2014/12/05 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. 
+-- #################################################################### + +-- ################################################################################## +-- SQL script used to extract data for transPLANT indices, gnpis thematic: genotyping +-- ################################################################################## + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + +-- extracting GENOTYPING_EXPERIMENT + +\o gnpis_'':thematic''_experiments.csv + +SELECT DISTINCT + '"' || CAST(ge.group_id as VARCHAR(3)) || '"' AS group_id, + '"Genotyping Study"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + '"' || ge.GENOTYPING_EXPERIMENT_NAME || ge.GENOTYPING_EXPERIMENT_ID || '"' AS identifier, + '"' || REPLACE( + ge.GENOTYPING_EXPERIMENT_NAME, '"', '''' + ) ||'"' AS name, + '"' || + CONCAT( + replace(ge.GENOTYPING_EXPERIMENT_NAME, '"', '''') , + ' is an experiment (type: ' , bt.name, ')' , + CASE WHEN scientific_names IS NULL THEN ', ' + ELSE ' ran using samples of ' || scientific_names || ', ' END , + ' based on the marker set ', ms.MARKER_SET_NAME, '.', + CASE WHEN ge.PANEL_ID IS NULL THEN '' + ELSE ' The panel used is ' || p.PANEL_NAME || '.' + END , + CASE WHEN ge.PROJECT_ID IS NULL THEN '' + ELSE ' This experiment belongs to the scientific project ' || pr.PROJECT_CODE || '' + END, + CASE WHEN h.HARDWARE_ID IS NULL OR h.HARDWARE_NAME = 'unknown' THEN '.' + ELSE ' and use the hardware ' || h.HARDWARE_NAME || ', model ' || h.MODEL || '.' + END, + CASE WHEN acc_names IS NULL THEN '' + ELSE ' Accession names: ' || acc_names || '.' + END, + CASE WHEN acc_numbers IS NULL THEN '' + ELSE ' Accession number: ' || acc_numbers || '.' + END, + CASE WHEN acc_synonyms IS NULL THEN '' + ELSE ' Accession synonyms: ' || acc_synonyms || '.' 
+ END + ) ||'"' AS description, + '"' || CONCAT(:'application_url', '/GnpSNP/snp/genotyping/form.do#results/experimentIds=', ge.GENOTYPING_EXPERIMENT_ID) || '"' AS url, + '"' || CASE WHEN scientific_names IS NULL THEN '' ELSE scientific_names END || '"' AS species, + '"' || CASE WHEN encoded_puids IS NULL THEN '' ELSE encoded_puids END || '"' AS linkedRessourcesID +FROM + GENOTYPING_EXPERIMENT ge + LEFT JOIN PANEL p ON p.PANEL_ID = ge.PANEL_ID + JOIN MARKER_SET ms ON ms.MARKER_SET_ID = ge.MARKER_SET_ID + LEFT JOIN PROJECT pr ON pr.PROJECT_ID = ge.PROJECT_ID + LEFT JOIN PROTOCOL pro ON pro.PROTOCOL_ID = ge.PROTOCOL_ID + LEFT JOIN HARDWARE h ON h.HARDWARE_ID = pro.HARDWARE_ID + JOIN BIO_TYPE bt ON bt.BIO_TYPE_ID = ge.GENOTYPING_TYPE_ID + LEFT JOIN GENOTYPING_EXP_LOT gel ON gel.GENOTYPING_EXPERIMENT_ID = ge.GENOTYPING_EXPERIMENT_ID + LEFT JOIN LOT l ON l.LOT_ID = gel.LOT_ID + LEFT JOIN ACCESSION acc ON acc.ACCESSION_ID = l.ACCESSION_ID + LEFT JOIN TAXON tax ON tax.TAXON_ID = acc.TAXON_ID + INNER JOIN + (SELECT + GENOTYPING_EXPERIMENT_ID AS ge_id, + string_agg(distinct(acc_name), ', ') AS acc_names, + string_agg(distinct(acc_number), ', ') AS acc_numbers, + string_agg(distinct(acc_synonym), ', ') AS acc_synonyms, + string_agg(distinct(scientific_name), ', ') AS scientific_names, + string_agg( + distinct( + CASE WHEN acc_puid like 'gnpis_pui%' then + 'urn:URGI/' ||(replace(acc_puid, ':', '%3A')) + ELSE + acc_puid + END + ) + , ', ' + ) AS encoded_puids + FROM + (SELECT DISTINCT + ge.GENOTYPING_EXPERIMENT_ID AS GENOTYPING_EXPERIMENT_ID, + a.ACCESSION_NAME AS acc_name, + a.ACCESSION_NUMBER AS acc_number, + acs.ACCESSION_SYNONYM_NAME AS acc_synonym, + tax.SCIENTIFIC_NAME AS scientific_name, + a.PUID AS acc_puid + FROM GENOTYPING_EXPERIMENT ge + LEFT JOIN GENOTYPING_EXP_LOT gel ON gel.GENOTYPING_EXPERIMENT_ID = ge.GENOTYPING_EXPERIMENT_ID + LEFT JOIN LOT l ON l.LOT_ID = gel.LOT_ID + LEFT JOIN ACCESSION a ON a.ACCESSION_ID = l.ACCESSION_ID + LEFT JOIN ACCESSION_SYNONYM acs 
ON acs.ACCESSION_ID = a.ACCESSION_ID + LEFT JOIN TAXON tax ON tax.TAXON_ID = a.TAXON_ID) AS GEXP_ID_W_ACCESSIONS + GROUP BY GEXP_ID_W_ACCESSIONS.GENOTYPING_EXPERIMENT_ID) AS DISTINCT_AGG_ACCESSIONS ON DISTINCT_AGG_ACCESSIONS.ge_id = ge.GENOTYPING_EXPERIMENT_ID +ORDER BY identifier; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_mapping_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_mapping_extraction.sql new file mode 100644 index 0000000..a1a00a9 --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_mapping_extraction.sql @@ -0,0 +1,277 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): E. Kimmel, R. Flores, D. Charruaud +-- Created on 2014/07/22 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. +-- #################################################################### + +-- ############################################################################### +-- SQL script used to extract data for transPLANT indices, gnpis thematic: mapping +-- ############################################################################### + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + +-- extract MAPS + +\o gnpis_'':thematic''_maps.csv + +select distinct + '"' || CAST(m.group_id as VARCHAR(3)) || '"' AS group_id, + '"Genetic map"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('GENETIC_MAP_' , MAP_ID,'_',MAP_NAME)) AS identifier, + '"' || replace(m.map_name, '"', '''') || '"' AS name, + QUOTE_IDENT( + CONCAT( + MAP_NAME , ' is a ' , + CASE (m.IS_CONSENSUS) + WHEN 0 THEN '' + WHEN 1 THEN 'consensus ' + END , + CASE WHEN t.SCIENTIFIC_NAME IS NULL THEN '' + ELSE t.SCIENTIFIC_NAME || ' ' END , + CASE WHEN bt.NAME IS NULL THEN ' map' + ELSE bt.NAME || ' map' END , + ' created on ', CAST(m.map_date AS DATE), + ' 
involving population ' , pop.POPULATION_NAME , + CASE WHEN pop.POPULATION_AUTHOR IS NULL THEN '' + ELSE ' (authored by ' || pop.POPULATION_AUTHOR || ')' END, + CASE WHEN m.UNIT IS NULL OR m.UNIT = '' THEN '' + ELSE '. Its unit is: ' || m.UNIT END , + '. Map contact is ' , c.FIRST_NAME, ' ' , c.LAST_NAME , ' from ' , i.INSTITUTION_NAME , + CASE WHEN i.ORGANIZATION IS NULL OR i.ORGANIZATION = '' THEN '.' + ELSE ', ' || i.ORGANIZATION || '.' END + ) + ) AS DESCRIPTION, + QUOTE_IDENT(CONCAT(:'application_url','/GnpMap/mapping/id.do?action=MAP&id=', MAP_ID)) AS url, + QUOTE_IDENT(t.SCIENTIFIC_NAME) AS species, + '""' AS linkedRessourcesID +FROM map m + JOIN taxon t ON m.taxon_id = t.taxon_id + JOIN bio_type bt ON bt.BIO_TYPE_ID = m.BIO_TYPE_ID + JOIN population pop ON m.POPULATION_ID = pop.POPULATION_ID + JOIN contact c ON c.CONTACT_ID = m.CONTACT_ID + JOIN institution i ON i.INSTITUTION_ID = c.INSTITUTION_ID +ORDER BY identifier; + +-- extract QTL + +\o gnpis_'':thematic''_qtls.csv + +SELECT DISTINCT + '"' || CAST(q.group_id as VARCHAR(3)) || '"' AS group_id, + '"QTL"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('QTL_', q.MAPPABLE_ELEMT_ID,'_', q.QTL_NAME)) AS identifier, + '"' || replace(q.QTL_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT( + CONCAT( + q.QTL_NAME , ' is a ', t.trait_name, ' QTL which has been detected on ', qd.QTL_DETEC_DATE , + CASE WHEN qd.METHOD IS NOT NULL AND qd.METHOD != '' THEN + ', using method ' || qd.METHOD || + CASE (lower(qd.PARAMETER)) WHEN 'unknown' THEN '' + ELSE ' with parameter(s) ' || qd.PARAMETER END + END, + '. This QTL is mapped on ', map_names, ' map(s) (', taxon_names, ').' 
+ ) + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url','/GnpMap/mapping/id.do?action=QTL&id=',q.MAPPABLE_ELEMT_ID)) AS url, + QUOTE_IDENT(CASE WHEN taxon_names IS NOT NULL THEN taxon_names ELSE '' END) AS species, + '""' AS linkedRessourcesID +FROM QTL q +JOIN QTL_DETECTION qd ON q.QTL_DETEC_ID = qd.QTL_DETEC_ID +JOIN ASSIGNMENT a ON a.MAPPABLE_ELEMT_ID = q.MAPPABLE_ELEMT_ID +JOIN MEASURE mea ON mea.MEASURE_ID = qd.MEASURE_ID +JOIN TRAIT t ON mea.TRAIT_ID = t.TRAIT_ID +left join +(select qtl.MAPPABLE_ELEMT_ID as qtl_id, + string_agg(distinct(map.MAP_NAME), ', ') as map_names, + string_agg(distinct(tax.SCIENTIFIC_NAME), ',') as taxon_names + from qtl qtl + JOIN assignment ass ON ass.MAPPABLE_ELEMT_ID = qtl.MAPPABLE_ELEMT_ID + JOIN map map on map.MAP_ID = ass.MAP_ID + JOIN taxon tax on tax.TAXON_ID = map.TAXON_ID + group by qtl.MAPPABLE_ELEMT_ID) as qtl_map on qtl_map.QTL_ID = q.MAPPABLE_ELEMT_ID +WHERE a.IS_QTL = 'yes' +ORDER BY identifier; + +-- extract MetaQTL + +\o gnpis_'':thematic''_metaqtls.csv + +SELECT DISTINCT + '"' || CAST(mq.group_id as VARCHAR(3)) || '"' AS group_id, + '"QTL"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('META_QTL_', mq.MAPPABLE_ELEMT_ID,'_', mq.META_QTL_NAME)) AS identifier, + '"' || replace(mq.META_QTL_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT(CONCAT( + mq.META_QTL_NAME, ' is a ', t.TRAIT_NAME, + ' MetaQTL found from the meta-analysis: ', ma.META_ANALYSIS_NAME, + ' with the ', ma.META_ANALYSIS_METHOD, ' method.', + ' This MetaQTL is mapped on ', tax.SCIENTIFIC_NAME, ' ', m.MAP_NAME, ' map.') + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url','/GnpMap/mapping/card.do?&dbName=mapping&className=metaqtl.MetaQtlImpl&id=', mq.MAPPABLE_ELEMT_ID)) AS url, + QUOTE_IDENT(tax.SCIENTIFIC_NAME) AS species, + '""' AS linkedRessourcesID +FROM META_QTL mq +JOIN META_ANALYSIS ma ON mq.META_ANALYSIS_ID = ma.META_ANALYSIS_ID +JOIN ASSIGNMENT a ON a.MAPPABLE_ELEMT_ID = mq.MAPPABLE_ELEMT_ID +JOIN 
MAP m ON m.MAP_ID = a.MAP_ID +JOIN TAXON tax ON tax.TAXON_ID = m.TAXON_ID +JOIN TRAIT t ON mq.TRAIT_ID = t.TRAIT_ID +WHERE a.IS_META_QTL = 'yes' +ORDER BY identifier; + +-- extract MAPPED MARKERS + +\o gnpis_'':thematic''_mapped_markers.csv + +SELECT DISTINCT + '"' || CAST(m.group_id as VARCHAR(3)) || '"' AS group_id, + '"Marker"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('MARKER_', m.MARKER_ID,'_', m.MARKER_NAME)) AS identifier, + '"' || replace(m.MARKER_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT( + CONCAT( + MARKER_NAME , ' is a' , + CASE (lower(bt.NAME)) WHEN 'unknown' THEN '' + ELSE ' ' || bt.NAME END , + ' mapped marker from taxon ', t.SCIENTIFIC_NAME, + CASE WHEN agg_marker_synonym.MARKER_SYNONYM_NAMES IS NOT NULL THEN + '. This marker has some synonyms: ' || agg_marker_synonym.MARKER_SYNONYM_NAMES END , + '. Its locus is/are: ', locus_names, + CASE WHEN positions IS NOT NULL THEN + ' at positions ' || positions END , + CASE WHEN map_names IS NOT NULL THEN + ' on map(s): ' || map_names END , + CASE WHEN m.GENE_FUNCTION IS NULL OR m.GENE_FUNCTION = '' THEN '' + ELSE + CASE (lower(m.GENE_FUNCTION)) WHEN 'unknown' THEN '' + ELSE '. Its gene function is: ' || m.GENE_FUNCTION + END + END , + CASE WHEN m.CONTIG_NAME IS NULL OR m.CONTIG_NAME = '' THEN '' + ELSE '. Its contig name is: ' || m.CONTIG_NAME END , + CASE WHEN m.INSERT_LENGTH IS NULL THEN '' + ELSE '. Its insert length is: ' || m.INSERT_LENGTH END , + CASE WHEN m.REVERSE_PRIMER IS NULL OR m.REVERSE_PRIMER = '' THEN '' + ELSE '. Reverse primer: ' || m.REVERSE_PRIMER END , + CASE WHEN m.FORWARD_PRIMER IS NULL OR m.FORWARD_PRIMER = '' THEN '' + ELSE '. Forward primer: ' || m.FORWARD_PRIMER END , + -- gestion des marqueurs Kaspar (Don't remove for FD, Don't uncomment for URGI) + -- CASE WHEN sequence_names IS NOT NULL OR sequence_names != '' THEN + -- '. 
Sequence name(s): ' || sequence_names END , + CASE WHEN m.SHORT_REMARK IS NULL OR m.SHORT_REMARK = '' THEN '' + -- remove all quotes in user's remarks + ELSE '. Short remark linked: ''' || REPLACE(m.SHORT_REMARK, '"', '') || '''' END + ) + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url','/GnpMap/mapping/id.do?dbName=mapping&action=MARKER&className=MarkerImpl&id=', m.MARKER_ID)) AS url, + QUOTE_IDENT(t.SCIENTIFIC_NAME) AS species, + '""' AS linkedRessourcesID +FROM marker m +JOIN taxon t ON m.TAXON_ID = t.TAXON_ID +JOIN bio_type bt ON bt.BIO_TYPE_ID = m.BIO_TYPE_ID +INNER JOIN locus l ON l.MARKER_ID = m.MARKER_ID +LEFT JOIN +(select marker_id as m_id, string_agg(distinct(ma_synonym_name), ', ') as marker_synonym_names +from ( +select distinct ma.MARKER_ID as marker_id, ms.MARKER_SYNONYM_NAME as ma_synonym_name +from marker ma +join marker_synonym_marker msm on msm.MARKER_ID = ma.MARKER_ID +join marker_synonym ms on ms.MARKER_SYNONYM_ID = msm.MARKER_SYNONYM_ID) as m_id_m_synonym +group by m_id_m_synonym.MARKER_ID) as agg_marker_synonym on agg_marker_synonym.M_ID = m.MARKER_ID +-- gestion des marqueurs Kaspar (Don't remove for FD, Don't uncomment for URGI) +-- LEFT JOIN +-- (select marker_id as mid, string_agg(distinct(sequence_name), ', ') as sequence_names +-- from ( +-- select distinct ma.marker_id as marker_id, mseq.sequence_name as sequence_name +-- from marker ma +-- join marker_sequence mseq on ma.MARKER_ID = mseq.MARKER_ID) as seq_m_id +-- group by seq_m_id.marker_id) as marker_seq on marker_seq.mid = m.marker_id +left join +(select ma.MARKER_ID as ma_id, + string_agg(distinct(map.MAP_NAME), ', ') as map_names, + string_agg(distinct(l.LOCUS_NAME), ', ') as locus_names, + string_agg(CAST(pa.ABS_DISTANCE AS VARCHAR(6)), ', ') as positions + from marker ma + INNER JOIN locus l ON l.MARKER_ID = ma.MARKER_ID + JOIN mappable_element me ON me.MAPPABLE_ELEMT_ID = l.MAPPABLE_ELEMT_ID + LEFT JOIN assignment ass ON ass.MAPPABLE_ELEMT_ID = me.MAPPABLE_ELEMT_ID + 
LEFT JOIN point_assignment pa ON pa.ASSIGNMENT_ID = ass.ASSIGNMENT_ID + LEFT JOIN map map on map.MAP_ID = ass.MAP_ID + group by ma.MARKER_ID) AS marker_map ON marker_map.ma_id = m.MARKER_ID +ORDER BY identifier; + + +-- extract NOT MAPPED MARKERS + +\o gnpis_'':thematic''_not_mapped_markers.csv + +SELECT DISTINCT + '"' || CAST(m.group_id as VARCHAR(3)) || '"' AS group_id, + '"Marker"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('MARKER_', m.MARKER_ID,'_NOT_MAPPED_', m.MARKER_NAME)) AS identifier, + '"' || replace(m.MARKER_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT( + CONCAT( + MARKER_NAME , ' is a' , + CASE (lower(bt.NAME)) WHEN 'unknown' THEN '' + ELSE ' ' || bt.NAME END , + ' marker from taxon ' , t.SCIENTIFIC_NAME, + CASE WHEN agg_marker_synonym.MARKER_SYNONYM_NAMES IS NOT NULL THEN + '. This marker has some synonyms: ' || agg_marker_synonym.MARKER_SYNONYM_NAMES END , + CASE WHEN l.LOCUS_NAME IS NULL THEN '' + ELSE '. Its locus is: ' || l.LOCUS_NAME END , + CASE WHEN m.GENE_FUNCTION IS NULL OR m.GENE_FUNCTION = '' THEN '' + ELSE + CASE (lower(m.GENE_FUNCTION)) WHEN 'unknown' THEN '' + ELSE '. Its gene function is: ' || m.GENE_FUNCTION + END + END , + CASE WHEN m.CONTIG_NAME IS NULL OR m.CONTIG_NAME = '' THEN '' + ELSE '. Its contig name is: ' || m.CONTIG_NAME END , + CASE WHEN m.INSERT_LENGTH IS NULL THEN '' + ELSE '. Its insert length is: ' || m.INSERT_LENGTH END , + CASE WHEN m.REVERSE_PRIMER IS NULL OR m.REVERSE_PRIMER = '' THEN '' + ELSE '. Reverse primer: ' || m.REVERSE_PRIMER END , + CASE WHEN m.FORWARD_PRIMER IS NULL OR m.FORWARD_PRIMER = '' THEN '' + ELSE '. Forward primer: ' || m.FORWARD_PRIMER END , + -- gestion des marqueurs Kaspar + CASE WHEN sequence_names IS NOT NULL OR sequence_names != '' THEN + '. Sequence name: ' || sequence_names END , + CASE WHEN m.SHORT_REMARK IS NULL OR m.SHORT_REMARK = '' THEN '' + ELSE '. 
Short remark linked: ''' || REPLACE(m.SHORT_REMARK, '"', '') || '''' END + ) + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url','/GnpMap/mapping/id.do?dbName=mapping&action=MARKER&className=MarkerImpl&id=', m.MARKER_ID)) AS url, + QUOTE_IDENT(t.SCIENTIFIC_NAME) AS species, + '""' AS linkedRessourcesID +FROM marker m +JOIN taxon t ON m.TAXON_ID = t.TAXON_ID +JOIN bio_type bt ON bt.BIO_TYPE_ID = m.BIO_TYPE_ID +LEFT JOIN locus l ON l.MARKER_ID = m.MARKER_ID +LEFT JOIN +(select marker_id as m_id, string_agg(distinct(ma_synonym_name), ', ') as marker_synonym_names +from ( +select distinct ma.MARKER_ID as marker_id, ms.MARKER_SYNONYM_NAME as ma_synonym_name +from marker ma +join marker_synonym_marker msm on msm.MARKER_ID = ma.MARKER_ID +join marker_synonym ms on ms.marker_synonym_id = msm.MARKER_SYNONYM_ID) as m_id_m_synonym +group by m_id_m_synonym.MARKER_ID) as agg_marker_synonym on agg_marker_synonym.M_ID = m.MARKER_ID +LEFT JOIN +(select marker_id as mid, string_agg(distinct(sequence_name), ', ') as sequence_names +from ( +select distinct ma.MARKER_ID as marker_id, mseq.SEQUENCE_NAME as sequence_name +from marker ma +join marker_sequence mseq on ma.MARKER_ID = mseq.MARKER_ID) as seq_m_id +group by seq_m_id.MARKER_ID) as marker_seq on marker_seq.MID = m.MARKER_ID +WHERE l.MARKER_ID is null +ORDER BY identifier; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_phenotyping_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_phenotyping_extraction.sql new file mode 100644 index 0000000..4d1c95d --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_phenotyping_extraction.sql @@ -0,0 +1,89 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): R. Flores, D. Charruaud, E. Kimmel +-- Created on 2014/12/08 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. 
+-- #################################################################### + +-- ################################################################################### +-- SQL script used to extract data for transPLANT indices, gnpis thematic: phenotyping +-- ################################################################################### + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + +-- extract TRIALS + +\o gnpis_'':thematic''_trials.csv + +SELECT DISTINCT + '"' || CAST(t.group_id as VARCHAR(3)) || '"' AS group_id, + '"Phenotyping study"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + CONCAT('"TRIAL_', t.trial_id,'_',t.trial_number, '"') AS identifier, + '"' || replace(t.name, '"', '''') ||'"' AS name, + '"' || CONCAT( + REPLACE(t.trial_number, '"', ''''), ' is a trial lead at site: ', s.site_name, + CASE WHEN s.latitude IS NOT NULL AND s.longitude IS NOT NULL THEN + ' (lat/long: ' || CAST(s.latitude AS VARCHAR(6)) || '/' || CAST(s.longitude AS VARCHAR(6)) || ')' END, + CASE WHEN t.name != t.trial_number THEN + ', described as ''' || t.name || '''' END, + CASE WHEN pa.panel_name IS NOT NULL THEN + ', involving panel ' || pa.panel_name END, + CASE WHEN t.trial_design IS NOT NULL THEN + ', designed as follows: ''' || t.trial_design || '''' END, + CASE WHEN t.comments IS NOT NULL THEN + ', which comment is: ''' || t.comments || '''' END, + CASE WHEN observation_names IS NOT NULL THEN + '. Observation variables: ' || observation_names END, + CASE WHEN t.date_begin IS NOT NULL AND t.date_end IS NOT NULL THEN + '. This trial started on ' || CAST(t.date_begin AS DATE) || ' and finished on ' || CAST(t.date_end AS DATE) END, + CASE WHEN p.project_code IS NOT NULL THEN + ', in the frame of project: ''' || p.project_code || '''' END, + CASE WHEN accession_names IS NOT NULL THEN + '. 
Accession names: ' || accession_names || ' from taxon(s) ' || taxon_names END + ) ||'"' AS description, + '"' || CONCAT(:'application_url', '/ephesis/ephesis/viewer.do#trialCard/trialId=', t.trial_id) || '"' AS url, + '"' || CASE WHEN taxon_names IS NULL THEN '' ELSE taxon_names END || '"' AS species, + '"' || nullif(concat_ws(', ', + CASE WHEN encoded_puids IS NOT NULL THEN encoded_puids END, + CASE WHEN s.site_id IS NOT NULL THEN ('urn:URGI/location/'||s.site_id)::text END + ), '') || '"' AS linkedRessourcesID +FROM trial t +LEFT JOIN trial_lot tl ON tl.trials_id = t.trial_id +LEFT JOIN lot l ON l.lot_ID = tl.lots_id +LEFT JOIN accession a on a.accession_id = l.accession_id +JOIN site s ON s.site_id = t.site_id +LEFT JOIN panel pa ON pa.panel_id = t.panel_id +LEFT JOIN project p ON p.project_id = t.project_id +LEFT JOIN + (select tr.trial_id as tid, + string_agg(distinct(a.accession_name), ', ') AS accession_names, + string_agg(distinct(t.scientific_name), ', ') AS taxon_names, + string_agg( + distinct( + CASE WHEN a.puid like 'gnpis_pui%' then + 'urn:URGI/' ||(replace(a.puid, ':', '%3A')) + ELSE + a.puid + END + ) + , ', ' + ) AS encoded_puids + from trial tr + join trial_lot tl on tl.trials_id = tr.trial_id + join lot l on l.lot_id = tl.lots_id + join accession a on a.accession_id = l.accession_id + join taxon t on t.taxon_id = a.taxon_id + group by tr.trial_id) as trial_acc_tax on trial_acc_tax.tid = t.trial_id +LEFT JOIN + (select tr.trial_id as tid, + string_agg(distinct(CONCAT(ov.term_identifier,' ',ov.variable_specific_name)), ', ') AS observation_names + from trial tr + join trial_observation_variable tov on tov.trials_id = tr.trial_id + join observation_variable ov on tov.observation_variables_id = ov.observation_variable_id + group by tr.trial_id) as trial_ov on trial_ov.tid = t.trial_id +ORDER BY identifier; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_sequences_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_sequences_extraction.sql new 
file mode 100644 index 0000000..441ea68 --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_sequences_extraction.sql @@ -0,0 +1,102 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): E. Kimmel, D. Charruaud +-- Created on 2014/12/05 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. +-- #################################################################### + +-- ################################################################################# +-- SQL script used to extract data for transPLANT indices, gnpis thematic: sequences +-- ################################################################################# + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + +-- extracting ngs experiment + +\o gnpis_'':thematic''_ngs_experiments.csv + +SELECT DISTINCT + '"' || CAST(ne.group_id as VARCHAR(3)) || '"' AS group_id, + '"Sequencing experiment"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('NGS_EXPERIMENT_' , ne.experiment_id,'_',ne.experiment_name)) AS identifier, + '"' || replace(ne.experiment_name, '"', '''') ||'"' AS name, + QUOTE_IDENT(CONCAT(ne.experiment_name , + ' is an experiment (type: ' , btstudy.name , ')', + CASE WHEN sample_names IS NOT NULL THEN + ' involving sample(s) ' || sample_names END, + CASE WHEN accession_names IS NOT NULL THEN + ' and accession(s) ' || accession_names || ' (' || taxon_names || ')' END, + CASE WHEN subrun_names IS NOT NULL THEN + ' in subrun(s) ' || subrun_names END, '.', + ' Sequencing type is ', btseqtype.name, '.', + ' The project is ', p.project_name, '.', + CASE WHEN ne.description IS NOT NULL OR ne.description != '' THEN + ' The description is: ' || ne.description END) + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url', 
'/sequence/sequence/card/experiment.do?dbName=sequence&className=ngs.NgsExperimentImpl&id=' , ne.experiment_id)) AS url, + QUOTE_IDENT(CASE WHEN taxon_names IS NOT NULL THEN taxon_names ELSE '' END) AS species, + '"' || CASE WHEN encoded_puids IS NULL THEN '' ELSE encoded_puids END || '"' AS linkedRessourcesID +FROM NGS_EXPERIMENT ne +JOIN BIO_TYPE btstudy on btstudy.BIO_TYPE_ID = ne.study_type_id +JOIN BIO_TYPE btseqtype on btseqtype.BIO_TYPE_ID = ne.sequencing_type_id +JOIN PROJECT p on p.project_id = ne.project_id +LEFT JOIN +(select ep.experiment_id as eid, + string_agg(distinct(s.subrun_name), ', ') as subrun_names, + string_agg(distinct(ngss.sample_name), ', ') as sample_names, + string_agg(distinct(acc.accession_name), ', ') as accession_names, + string_agg( + distinct( + CASE WHEN acc.puid like 'gnpis_pui%' then + 'urn:URGI/' ||(replace(acc.puid, ':', '%3A')) + ELSE + acc.puid + END + ) + , ', ' + ) AS encoded_puids, + string_agg(distinct(t.scientific_name), ', ') as taxon_names +from experiment_pool ep +join subrun s on s.subrun_id = ep.subrun_id +join ngs_sample ngss on ngss.sample_id = ep.sample_id +left join accession acc on acc.accession_id = ngss.accession_id +join taxon t on t.taxon_id = acc.taxon_id +group by ep.experiment_id) as exp_subrun_sample on exp_subrun_sample.eid = ne.experiment_id +ORDER BY identifier; + +-- extracting ngs analysis + +\o gnpis_'':thematic''_ngs_analyses.csv + +SELECT DISTINCT + '"' || CAST(na.group_id as VARCHAR(3)) || '"' AS group_id, + '"Sequences analysis"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('NGS_ANALYSIS_' , na.analysis_id,'_', na.analysis_name)) AS identifier, + '"' || replace(na.analysis_name, '"', '''') ||'"' AS name, + QUOTE_IDENT(CONCAT(na.analysis_name, + ' is an analysis (type: ', bt.name, ') realized with the software ', s.software_name, '.', + CASE WHEN g.genome_name IS NOT NULL THEN + ' The ref. 
genome used is ' || g.genome_name END, + CASE WHEN t.scientific_name IS NOT NULL THEN + ' whose taxon is ' || t.scientific_name || '.' END, + CASE WHEN na.comments IS NOT NULL THEN + ' Whose comments are: ' || na.comments || '.' END, + CASE WHEN p.project_name IS NOT NULL THEN + ' The project is ' || p.project_name || '.' END + )) AS description, + QUOTE_IDENT(CONCAT(:'application_url', '/sequence/sequence/card/analysis.do?dbName=sequence&className=ngs.NgsAnalysisImpl&id=' , na.analysis_id)) AS url, + QUOTE_IDENT(CASE WHEN t.SCIENTIFIC_NAME IS NOT NULL THEN t.SCIENTIFIC_NAME ELSE '' END) AS species, + '""' AS linkedRessourcesID +FROM NGS_ANALYSIS na +JOIN BIO_TYPE bt on bt.BIO_TYPE_ID = na.analysis_type_id +JOIN SOFTWARE s on s.software_id = na.software_id +LEFT JOIN PROJECT p on p.project_id = na.project_id +LEFT JOIN GENOME g on g.genome_id = na.genome_id +LEFT JOIN TAXON t on t.taxon_id = g.taxon_id +ORDER BY identifier; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_synteny_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_synteny_extraction.sql new file mode 100644 index 0000000..28365d7 --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_synteny_extraction.sql @@ -0,0 +1,149 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): E. Kimmel, D. Charruaud +-- Created on 2014/12/09 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. 
+-- #################################################################### + +-- ############################################################################### +-- SQL script used to extract data for transPLANT indices, gnpis thematic: synteny +-- ############################################################################### + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + +-- extracting GENE + +\o gnpis_'':thematic''_genes.csv + +-- SELECT count (DISTINCT CONCAT('SYNTENY_' , g.GENE_NAME , '_DS_' , d.DATASET_ID) ) +SELECT DISTINCT + '"' || CAST(g.group_id as VARCHAR(3)) || '"' AS group_id, + '"Gene annotation"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT(g.GENE_NAME, '_in_' , d.DATASET_NAME || ' ' || d.VERSION, '_AC_', ac.ANCESTRAL_CHROMOSOME_NAME)) AS identifier, + '"' || replace(g.GENE_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT( + CONCAT( + g.GENE_NAME , + ' is a syntenic gene from dataset: ''' , d.DATASET_NAME , '_version_', d.VERSION , ''', located on chromosome ''' , c.CHROMOSOME_NAME , + ''' of ''' , tax.scientific_name , '''' , + CASE WHEN ga.START_POSITION IS NULL THEN '.' + ELSE ' between positions ' || ga.START_POSITION || ' and ' || ga.STOP_POSITION || '.' + END, + ' It is linked to ancestral chromosome ''' , ac.ANCESTRAL_CHROMOSOME_NAME , '''.' , + CASE WHEN homolog_gene_names IS NOT NULL THEN ' It belongs to an homology group with: ' || homolog_gene_names || '.' END, + CASE WHEN qtl_names IS NOT NULL THEN + ' This gene is positioned on QTL: ' || qtl_names || '''.' + END + , + ' It is also linked to MetaQTL ' || MQTL_INFO.mqtl_trait_name || ' ' || MQTL_INFO.mqtl_name, + ' identified from following QTLs: ' || MQTL_INFO.AGG_QTL_INFOS || '.' 
+        )
+    ) AS description,
+    QUOTE_IDENT(
+        CONCAT(
+            :'application_url', '/synteny/synteny/viewer.do#results/',
+            'datasetId=' , d.DATASET_ID ,
+            '&geneName=' , g.GENE_NAME,
+            '&ancestralChromosomeId=', ac.ANCESTRAL_CHROMOSOME_ID
+        )
+    ) AS url,
+    QUOTE_IDENT(
+        CASE WHEN d.DATASET_TYPE_ID=450 AND d.IS_CURRENT_VERSION='true' AND d.DATASET_NAME = 'Wheat' THEN -- trick for the Wheat V2 dataset (dataset_id 6), which stores the modern Wheat genome in the ancestral_gene table
+            concat(tax.SCIENTIFIC_NAME, ',', DISTINCT_AGG_GENE_ID.HOMOLOG_TAXON_NAMES, ',Triticum aestivum')
+        ELSE
+            concat(tax.SCIENTIFIC_NAME, ',', DISTINCT_AGG_GENE_ID.HOMOLOG_TAXON_NAMES)
+        END
+    ) AS species,
+    '""' AS linkedRessourcesID
+FROM (
+    SELECT
+        GENE_ID AS G_ID, DS_ID,
+        STRING_AGG(DISTINCT GENE_NAME, ', ') AS homolog_gene_names,
+        STRING_AGG(DISTINCT QTL_NAME || '(trait: '|| TRAIT_NAME || ' - ' || TRAIT_DESCRIPTION || ')', ', ') AS qtl_names,
+        STRING_AGG(DISTINCT HOMOLOG_SCIENTIFIC_NAME, ',') AS HOMOLOG_TAXON_NAMES
+    FROM (
+        SELECT DISTINCT
+            g1.GENE_ID AS GENE_ID,
+            hg.DATASET_ID AS DS_ID,
+            g2.GENE_NAME AS GENE_NAME, -- gene name of g1's homologs
+            q.QTL_NAME AS QTL_NAME,
+            t.TRAIT_NAME AS TRAIT_NAME,
+            t.DESCRIPTION AS TRAIT_DESCRIPTION,
+            homolog_taxons.SCIENTIFIC_NAME AS HOMOLOG_SCIENTIFIC_NAME
+        FROM GENE g1
+        JOIN GENE_HOMOLOGY_GROUP ghg ON ghg.GENE_ID = g1.GENE_ID
+        JOIN HOMOLOGY_GROUP hg ON hg.HOMOLOGY_GROUP_ID= ghg.HOMOLOGY_GROUP_ID
+        JOIN DATASET d ON d.DATASET_ID=HG.DATASET_ID
+        LEFT JOIN GENE_HOMOLOGY_GROUP ghg2 ON ghg2.HOMOLOGY_GROUP_ID = hg.HOMOLOGY_GROUP_ID -- some genes are alone in their group => left join!
+ LEFT JOIN GENE g2 ON g2.GENE_ID = ghg2.GENE_ID AND g1.GENE_ID <> g2.GENE_ID + LEFT JOIN GENE_ASSIGNMENT ga2 ON ga2.GENE_ID = g2.GENE_ID AND ga2.DATASET_ID = hg.DATASET_ID + LEFT JOIN REF_SEQ rf ON rf.ref_seq_id=ga2.ref_seq_id + LEFT JOIN CHROMOSOME c ON c.CHROMOSOME_ID = rf.CHROMOSOME_ID + LEFT JOIN TAXON homolog_taxons ON homolog_taxons.TAXON_ID = c.TAXON_ID + JOIN GENE_ASSIGNMENT ga ON ga.GENE_ID = g1.GENE_ID AND ga.DATASET_ID = hg.DATASET_ID + LEFT JOIN GENE_QTL gq ON gq.GENE_ASSIGNMENT_ID = ga.GENE_ASSIGNMENT_ID + LEFT JOIN QTL q ON q.MAPPABLE_ELEMT_id = gq.MAPPABLE_ELEMT_id + LEFT JOIN QTL_DETECTION qd ON q.QTL_DETEC_ID = qd.QTL_DETEC_ID + LEFT JOIN MEASURE m ON qd.MEASURE_ID = m.MEASURE_ID + LEFT JOIN TRAIT t ON m.TRAIT_ID = t.TRAIT_ID + -- WHERE d.DATASET_ID=6 -- use restriction for test purpose only + WHERE d.IS_CURRENT_VERSION='true' + AND d.DATASET_TYPE_ID=450 + ORDER BY GENE_NAME + ) AS DISTINCT_GENE_ID + GROUP BY DISTINCT_GENE_ID.GENE_ID, DISTINCT_GENE_ID.DS_ID +) AS DISTINCT_AGG_GENE_ID + JOIN GENE g ON g.GENE_ID = DISTINCT_AGG_GENE_ID.G_ID + JOIN GENE_ASSIGNMENT ga ON ga.GENE_ID = g.GENE_ID and ga.DATASET_ID = DISTINCT_AGG_GENE_ID.DS_ID + JOIN REF_SEQ rf ON rf.ref_seq_id=ga.ref_seq_id + JOIN CHROMOSOME c ON c.CHROMOSOME_ID = rf.CHROMOSOME_ID + JOIN TAXON tax ON tax.TAXON_ID = c.TAXON_ID + LEFT JOIN GENE_HOMOLOGY_GROUP ghg ON ghg.GENE_ID = g.GENE_ID + JOIN BIO_TYPE bt ON bt.BIO_TYPE_ID = ghg.BIO_TYPE_ID + LEFT JOIN HOMOLOGY_GROUP hg ON hg.HOMOLOGY_GROUP_ID = ghg.HOMOLOGY_GROUP_ID and hg.DATASET_ID = DISTINCT_AGG_GENE_ID.DS_ID + JOIN DATASET d ON d.DATASET_ID = hg.DATASET_ID + LEFT JOIN ANCESTRAL_GENE ag ON ag.ANCESTRAL_GENE_ID = hg.ANCESTRAL_GENE_ID + LEFT JOIN ANCESTRAL_CHROMOSOME ac ON ac.ANCESTRAL_CHROMOSOME_ID = ag.ANCESTRAL_CHROMOSOME_ID + LEFT JOIN ( + SELECT DISTINCT + mqtl.ANCESTRAL_CHROMOSOME_ID AS ANCESTRAL_CHROMOSOME_ID, + syntenome_left.RELATIVE_POSITION AS SYNTENOME_LEFT_POSITION, + syntenome_right.RELATIVE_POSITION AS 
SYNTENOME_RIGHT_POSITION, + mqtl.MQTL_NAME AS MQTL_NAME, + mqtl.MQTL_TRAIT_NAME AS MQTL_TRAIT_NAME, + QTL_INFOS_GROUPED.QTL_INFOS_AGGREGATED AS AGG_QTL_INFOS + FROM mqtl + JOIN ancestral_gene syntenome_left ON syntenome_left.ANCESTRAL_GENE_ID = mqtl.SYNTENOME_LEFT_ID + JOIN ancestral_gene syntenome_right ON syntenome_right.ANCESTRAL_GENE_ID = mqtl.SYNTENOME_RIGHT_ID + JOIN ( + SELECT + STRING_AGG(A_QTL_INFOS,', ') AS QTL_INFOS_AGGREGATED, + MQTL_ID AS MQTL_ID + FROM ( + SELECT DISTINCT + qi.MQTL_ID AS MQTL_ID, + CONCAT ( + qi.QTL_NAME, + ' is a QTL for trait ', qi.TRAIT_NAME, + ' referenced in publication ''', qi.PUBLICATION, + ''' from research station ', qi.RESEARCH_STATION,' (',qi.COUNTRY,') in ', qi.YEAR, + ' found from a ', qi.POPULATION_TYPE, ' population involving ', qi.P1,' and ', qi.P2 + ) AS A_QTL_INFOS + FROM QTL_INFOS qi + ) AS QTL_INFOS_AGGREGATION + GROUP BY QTL_INFOS_AGGREGATION.MQTL_ID + ) AS QTL_INFOS_GROUPED ON QTL_INFOS_GROUPED.MQTL_ID = mqtl.MQTL_ID + ) AS MQTL_INFO on + MQTL_INFO.SYNTENOME_LEFT_POSITION >= ag.RELATIVE_POSITION + AND MQTL_INFO.SYNTENOME_RIGHT_POSITION <= ag.RELATIVE_POSITION + AND MQTL_INFO.ANCESTRAL_CHROMOSOME_ID = ag.ANCESTRAL_CHROMOSOME_ID +-- WHERE d.DATASET_ID=6 -- use restriction for test purpose only +WHERE d.IS_CURRENT_VERSION='true' +AND d.DATASET_TYPE_ID=450 +ORDER BY identifier +; diff --git a/etl_gnpis-core_dd/sql/transplant_gnpis_transcriptome_extraction.sql b/etl_gnpis-core_dd/sql/transplant_gnpis_transcriptome_extraction.sql new file mode 100644 index 0000000..b1bc442 --- /dev/null +++ b/etl_gnpis-core_dd/sql/transplant_gnpis_transcriptome_extraction.sql @@ -0,0 +1,163 @@ +-- #################################################################### +-- Copyright (C) 2014 INRA-URGI +-- Author(s): E. Kimmel, D. 
Charruaud +-- Created on 2014/12/09 +-- Contact: urgi-contact@versailles.inrae.fr +-- It is strictly forbidden to transfer, use or re-use this code +-- or part of it without explicit written authorization from INRA-URGI. +-- #################################################################### + +-- ##################################################################################### +-- SQL script used to extract data for transPLANT indices, gnpis thematic: transcriptome +-- ##################################################################################### + +\pset format unaligned +\pset tuples_only +\pset fieldsep , + +-- extracting EXPERIMENT + +\o gnpis_'':thematic''_experiments.csv + +SELECT DISTINCT + '"' || CAST(exp.group_id as VARCHAR(3)) || '"' AS group_id, + '"Transcriptomic experiment"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('TRANSCRIPTOMIC_EXPERIMENT_' , exp.EXPERIMENT_ID,'_1')) AS identifier, + '"' || replace(exp.EXPERIMENT_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT(CONCAT(exp.EXPERIMENT_NAME, + ' is an experiment (type: ' , bt.NAME , ')' , + ' using samples ', SAMPLE_NAMES, ' of species ' , scientific_names , '.' , + ' This experiment belongs to the scientific project ', pr.PROJECT_CODE, '. 
', + exp.DESCRIPTION) + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url', '/GnpArray/transcriptome/id.do?action=EXPERIMENT&id=' , exp.EXPERIMENT_ID )) AS url, + QUOTE_IDENT(scientific_names) AS species, + '""' AS linkedRessourcesID +FROM ( + SELECT EXPERIMENT_ID AS e_id, + STRING_AGG(distinct(SCIENTIFIC_NAME), ',') AS scientific_names, + string_agg(distinct(SAMPLE_NAME), ', ') AS SAMPLE_NAMES + FROM ( + SELECT DISTINCT exp.EXPERIMENT_ID AS EXPERIMENT_ID, + tax.SCIENTIFIC_NAME AS SCIENTIFIC_NAME, + s.SAMPLE_NAME AS SAMPLE_NAME + FROM EXPERIMENT exp + JOIN EXP_HYBR eh ON eh.EXPERIMENT_ID = exp.EXPERIMENT_ID + JOIN HYBRIDIZATION h ON h.HYBRIDIZATION_ID = eh.HYBRIDIZATION_ID + JOIN HYBR_LABELED_EXTRACT hle ON hle.HYBRIDIZATION_ID = h.HYBRIDIZATION_ID + JOIN LABELED_EXTRACT le ON le.LABELED_EXTRACT_ID = hle.LABELED_EXTRACT_ID + JOIN EXTRACT_LABELED_EXTRACT ele ON ele.LABELED_EXTRACT_ID = le.LABELED_EXTRACT_ID + JOIN EXTRACT e ON e.EXTRACT_ID = ele.EXTRACT_ID + JOIN SAMPLE_EXTRACT se ON se.EXTRACT_ID = e.EXTRACT_ID + JOIN SAMPLE s ON s.SAMPLE_ID = se.SAMPLE_ID + JOIN SAMPLE_SOURCE ss ON ss.SAMPLE_SOURCE_ID = s.SAMPLE_SOURCE_ID + JOIN TAXON tax ON tax.TAXON_ID = ss.TAXON_ID) AS DISTINCT_EXPERIMENT_ID + GROUP BY DISTINCT_EXPERIMENT_ID.EXPERIMENT_ID) AS DISTINCT_AGG_EXPERIMENT_ID +JOIN EXPERIMENT exp ON exp.EXPERIMENT_ID = DISTINCT_AGG_EXPERIMENT_ID.e_id +LEFT JOIN PROJECT pr ON pr.PROJECT_ID = exp.PROJECT_ID +JOIN BIO_TYPE bt ON bt.BIO_TYPE_ID = exp.BIO_TYPE_ID +ORDER BY identifier; + + +-- extracting GENE + +\o gnpis_'':thematic''_genes.csv + +SELECT DISTINCT + '"' || CAST(g.group_id as VARCHAR(3)) || '"' AS group_id, + '"Gene annotation"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('TRANSCRIPTOMIC_GENE_' , g.GENE_ID,'_1')) AS identifier, + '"' || replace(g.GENE_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT(CONCAT(g.GENE_NAME , + ' is a gene involved in a transcriptomic experiments having expression level ''' , REGULATIONS , 
'''' , + ' in the gene lists ''' , GENE_LISTS , '''.' , + ' This experiment used samples ', SAMPLE_NAMES, ' of species ' , scientific_names , '.' , + ' This gene list belongs to the scientific project ''', PROJECTS, '''.') + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url', '/GnpArray/transcriptome/card.do?&dbName=common&className=GeneImpl&id=' , g.GENE_ID)) AS url, + QUOTE_IDENT(scientific_names) AS species, + '""' AS linkedRessourcesID +FROM ( + SELECT GENE_ID AS g_id, + STRING_AGG(distinct(SCIENTIFIC_NAME), ',') AS scientific_names, + string_agg(distinct(SAMPLE_NAME), ', ') AS SAMPLE_NAMES, + string_agg(distinct(GENE_LIST_NAME), ' , ') AS GENE_LISTS, + string_agg(distinct(PROJECTS), ' , ') AS PROJECTS, + string_agg(distinct(REGULATION), ' , ') AS REGULATIONS + FROM ( + SELECT DISTINCT g.GENE_ID AS GENE_ID, + tax.SCIENTIFIC_NAME AS SCIENTIFIC_NAME, + s.SAMPLE_NAME AS SAMPLE_NAME, + gl.GENE_LIST_NAME AS GENE_LIST_NAME, + p.PROJECT_CODE AS PROJECTS, + bt.name AS REGULATION + FROM GENE g + JOIN GENE_GENE_LIST ggl ON ggl.GENE_ID = g.GENE_ID + JOIN GENE_LIST gl ON gl.GENE_LIST_ID = ggl.GENE_LIST_ID + JOIN BIO_TYPE bt ON bt.BIO_TYPE_ID = gl.BIO_TYPE_ID + JOIN PROJECT p ON p.PROJECT_ID = gl.PROJECT_ID + JOIN EXPERIMENT exp ON exp.PROJECT_ID = p.PROJECT_ID + JOIN EXP_HYBR eh ON eh.EXPERIMENT_ID = exp.EXPERIMENT_ID + JOIN HYBRIDIZATION h ON h.HYBRIDIZATION_ID = eh.HYBRIDIZATION_ID + JOIN HYBR_LABELED_EXTRACT hle ON hle.HYBRIDIZATION_ID = h.HYBRIDIZATION_ID + JOIN LABELED_EXTRACT le ON le.LABELED_EXTRACT_ID = hle.LABELED_EXTRACT_ID + JOIN EXTRACT_LABELED_EXTRACT ele ON ele.LABELED_EXTRACT_ID = le.LABELED_EXTRACT_ID + JOIN EXTRACT e ON e.EXTRACT_ID = ele.EXTRACT_ID + JOIN SAMPLE_EXTRACT se ON se.EXTRACT_ID = e.EXTRACT_ID + JOIN SAMPLE s ON s.SAMPLE_ID = se.SAMPLE_ID + JOIN SAMPLE_SOURCE ss ON ss.SAMPLE_SOURCE_ID = s.SAMPLE_SOURCE_ID + JOIN TAXON tax ON tax.TAXON_ID = ss.TAXON_ID) AS DISTINCT_GENE_ID + GROUP BY DISTINCT_GENE_ID.GENE_ID) AS DISTINCT_AGG_GENE_ID +JOIN 
GENE g ON g.GENE_ID = DISTINCT_AGG_GENE_ID.g_id +JOIN GENE_GENE_LIST ggl ON ggl.GENE_ID = g.GENE_ID +JOIN GENE_LIST gl ON gl.GENE_LIST_ID = ggl.GENE_LIST_ID +JOIN PROJECT p ON p.PROJECT_ID = gl.PROJECT_ID +JOIN BIO_TYPE bt ON bt.BIO_TYPE_ID = gl.BIO_TYPE_ID +ORDER BY identifier; + + +-- extracting GENE_LIST + +\o gnpis_'':thematic''_gene_lists.csv + +select distinct + '"' || CAST(gl.group_id as VARCHAR(3)) || '"' AS group_id, + '"Transcriptomic gene list"' AS entry_type, + '"' ||:'source_name'|| '"' AS database_name, + QUOTE_IDENT(CONCAT('TRANSCRIPTOMIC_GENE_LIST_' , gl.GENE_LIST_ID,'_1')) AS identifier, + '"' || replace(gl.GENE_LIST_NAME, '"', '''') ||'"' AS name, + QUOTE_IDENT(CONCAT(gl.GENE_LIST_NAME , + ' is a gene list produced in a transcriptomic experiment and for which the expression level of the genes is ''' , bt.name , '''. ' , + CASE WHEN p.PROJECT_ID IS NULL THEN '' + ELSE ' This gene list belongs to the scientific project ''' || p.PROJECT_CODE || '''. ' END , + gl.DESCRIPTION) + ) AS description, + QUOTE_IDENT(CONCAT(:'application_url', '/GnpArray/transcriptome/geneListAction.do?content=all&method=details&geneListId=' , gl.GENE_LIST_ID)) AS url, + QUOTE_IDENT(scientific_names) AS species, + '""' AS linkedRessourcesID +FROM ( + SELECT GENE_LIST_ID AS gl_id, + STRING_AGG(distinct(SCIENTIFIC_NAME), ',') AS scientific_names + FROM ( + SELECT DISTINCT gl.GENE_LIST_ID AS GENE_LIST_ID, + tax.SCIENTIFIC_NAME AS SCIENTIFIC_NAME + FROM GENE_LIST gl + JOIN PROJECT p ON p.PROJECT_ID = gl.PROJECT_ID + JOIN EXPERIMENT exp ON exp.PROJECT_ID = p.PROJECT_ID + JOIN EXP_HYBR eh ON eh.EXPERIMENT_ID = exp.EXPERIMENT_ID + JOIN HYBRIDIZATION h ON h.HYBRIDIZATION_ID = eh.HYBRIDIZATION_ID + JOIN HYBR_LABELED_EXTRACT hle ON hle.HYBRIDIZATION_ID = h.HYBRIDIZATION_ID + JOIN LABELED_EXTRACT le ON le.LABELED_EXTRACT_ID = hle.LABELED_EXTRACT_ID + JOIN EXTRACT_LABELED_EXTRACT ele ON ele.LABELED_EXTRACT_ID = le.LABELED_EXTRACT_ID + JOIN EXTRACT e ON e.EXTRACT_ID = ele.EXTRACT_ID + 
JOIN SAMPLE_EXTRACT se ON se.EXTRACT_ID = e.EXTRACT_ID + JOIN SAMPLE s ON s.SAMPLE_ID = se.SAMPLE_ID + JOIN SAMPLE_SOURCE ss ON ss.SAMPLE_SOURCE_ID = s.SAMPLE_SOURCE_ID + JOIN TAXON tax ON tax.TAXON_ID = ss.TAXON_ID) AS DISTINCT_GENE_LIST_ID + GROUP BY DISTINCT_GENE_LIST_ID.GENE_LIST_ID) AS DISTINCT_AGG_GENE_LIST_ID +JOIN GENE_LIST gl ON gl.GENE_LIST_ID = DISTINCT_AGG_GENE_LIST_ID.gl_id +JOIN PROJECT p ON p.PROJECT_ID = gl.PROJECT_ID +JOIN BIO_TYPE bt ON bt.BIO_TYPE_ID = gl.BIO_TYPE_ID +ORDER BY identifier; diff --git a/etl_gnpis-core_dd/variables_enrichment.sh b/etl_gnpis-core_dd/variables_enrichment.sh new file mode 100755 index 0000000..8c0cc0d --- /dev/null +++ b/etl_gnpis-core_dd/variables_enrichment.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# +# variables_enrichment.sh +# +# Author: F. PHILIPPE, R. FLORES, C. MICHOTEY +# +# Copyright INRAE-URGI 2017-2021 +# + +check_jq() { + which jq > /dev/null + [ $? -ne 0 ] && echo "jq is not installed on this server, or not specified in PATH: " && echo "$PATH" && echo "Please install it via package manager or via tarball: https://stedolan.github.io/jq/download/. Exiting." && exit 1 + + JQ_VERSION=$(jq --version) + { [ "${JQ_VERSION}" != "jq-1.5" ] && [ "${JQ_VERSION}" != "jq-1.6" ] ; } && echo "jq version expected is jq-1.5 or above. Please use these versions only. Exiting." && exit 1 +} + +check_jq + +TMP_DIR=$(mktemp -d) +SCRIPT_DIR=$(readlink -f $(dirname $0)) +DEBUG=0 +VERBOSE=0 +PAGE_SIZE=1000 +WS_BASE_URL="https://urgi.versailles.inrae.fr/ws/webresources" +## Help display +############### + +usage() { + cat <<EOF + +Script used to enrich any ontology term with its name and synonym from input file. + +USAGE: ${0} -f <CSV_FILE> \ +[-v|-vv] \ +[-h|--help] + +PARAMS: + -f the input file to enrich + -ws URL base of BrAPI webservices. 
Default: ${WS_BASE_URL} + -v display verbose informations + -vv display very verbose informations + --debug do not remove intermediate files for debuging purpose + -h or --help print this help + +EXAMPLES: + + +EOF + exit 1 +} + +# [ $# -eq 0 ] && usage && exit 0 # exit after displaying help if no argument given + +## Get commandline params +######################### + +# get params +while [ -n "$1" ]; do + case $1 in + -h) usage;shift 1;; + --help) usage;shift 1;; + -f) CSV_FILE="$2";shift 2;; + -ws) WS_BASE_URL="$2";shift 2;; + -v) VERBOSE=1;shift 1;; + -vv) VERBOSE=2;shift 1;; + --debug) DEBUG=1;shift 1;; + --) shift;break;; + -*) echo && echo "Unknown option: $1" && echo;exit 1;; + *) break;; + esac +done + +[ $VERBOSE -ge 1 ] && echo "Using temp dir: $TMP_DIR" +[ $VERBOSE -ge 2 ] && PARALLEL_VERBOSE="--bar" + +[ -z "${CSV_FILE}" ] && echo "ERROR: missing input file. Exiting." && usage && exit 1 +TMP_OUTPUT_FILE="$(basename "${CSV_FILE}" .csv)_enriched.csv" + +export TMP_DIR + +fetch_observation_variables(){ + local currentPage=0 + [ -n "$1" ] && currentPage=$1 # $1 is optional parameter only used in recursion + [ $VERBOSE -ge 1 ] && [ $currentPage == 0 ] && echo "Fetching observation variables..." + [ $VERBOSE -ge 2 ] && echo "Process page $currentPage with size $PAGE_SIZE " + local CURL_CMD="curl -Ss -XGET '${WS_BASE_URL}/brapi/v1/variables?page=${currentPage}&pageSize=$PAGE_SIZE' | jq '.' > ${TMP_DIR}/observation_variables_list_${currentPage}.json" + [ $VERBOSE -ge 2 ] && echo "Executing cmd: $CURL_CMD" + eval "${CURL_CMD}" + totalPages=$(jq --raw-output '.metadata.pagination.totalPages' "${TMP_DIR}/observation_variables_list_${currentPage}.json" ) + local nextPage=$((1 + currentPage)) + if [ ${nextPage} -lt $totalPages ]; then + fetch_observation_variables $nextPage + fi + # once all page are fetched, merge data: + if [ ${currentPage} -eq 0 ]; then + jq '.result.data[] ' "${TMP_DIR}"/observation_variables_list_*.json | jq -s . 
> "${TMP_DIR}/observation_variables_list.json" + fi +} + +generate_key_value_csv(){ + [ $VERBOSE -ge 1 ] && echo "Generate key-value CSV file..." + jq --raw-output -f "${SCRIPT_DIR}/extract_observation_variables.jq" "${TMP_DIR}/observation_variables_list.json" > "${TMP_DIR}/value_list.csv" +} + +emit_sed_command(){ + line="$1" + key=$(echo $line | cut -f1 -d,) + value=$(echo $line | cut -f2- -d,) + printf '%s' "s#$key#$value#;" +} +export -f emit_sed_command + +create_sed_command() { + [ $VERBOSE -ge 1 ] && echo "Create sed command..." + parallel ${PARALLEL_VERBOSE} -k emit_sed_command :::: "${TMP_DIR}"/value_list.csv | cat - > "${TMP_DIR}"/enrich.sed +} + +process_line() { + line="$1" + echo "$line" | sed -f "${TMP_DIR}"/enrich.sed +} +export -f process_line + +enrich_csv(){ + [ $VERBOSE -ge 1 ] && echo "Enriching data" + # TODO: use ${CSV_FILE} from data_disscovery dir to avoid XREF oversize provoking a "Command line too long" error, because of an input line too long: + # 57% 1437:1055=13s "0","Phenotyping study","GnpIS","TRIAL_2322_0989010000","OrtetsCormiers","0989010000 is a trial lead at site: Paris - 0989010000 (lat/long: 48.853/2.3486), described as 'OrtetsCormiers'. Observation variables: CO_357:0000019 parallel: Error: Command line too long (408206 >= 131049) at input -1041: "0","Phenotyping study","GnpIS","TRIAL_2323_G5_25"... 
+ parallel ${PARALLEL_VERBOSE} -k process_line :::: "${CSV_FILE}" > "${TMP_DIR}/${TMP_OUTPUT_FILE}" +} + +clean() { + [ $VERBOSE -ge 1 ] && echo "Cleaning temporary files" + [ $VERBOSE -ge 2 ] && RM_OPTION="-v" + rm -rf "${RM_OPTION}" "${TMP_DIR}" +} + +main(){ + fetch_observation_variables + generate_key_value_csv + create_sed_command + enrich_csv + mv -f "${TMP_DIR}/${TMP_OUTPUT_FILE}" "${CSV_FILE}" + echo "Enriched file is located at: ${CSV_FILE}" + if [ $DEBUG -eq 0 ]; then + clean + fi +} + +main +exit 0 -- GitLab From 46649119bda56c402dea37a4daa61bacfccb2b95 Mon Sep 17 00:00:00 2001 From: Celia Michotey <celia.michotey@inra.fr> Date: Thu, 13 Apr 2023 17:28:16 +0200 Subject: [PATCH 3/3] Remove linkedResourcesID and group 0 filter for DD, correct bugs --- etl_gnpis-core_dd/extract_gnpis-core.sh | 23 ++++++++++++++--------- etl_gnpis-core_dd/map_values_to_json.jq | 2 +- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/etl_gnpis-core_dd/extract_gnpis-core.sh b/etl_gnpis-core_dd/extract_gnpis-core.sh index 81f0bb6..3d48308 100755 --- a/etl_gnpis-core_dd/extract_gnpis-core.sh +++ b/etl_gnpis-core_dd/extract_gnpis-core.sh @@ -244,24 +244,30 @@ check_extracted_data() { } export -f check_extracted_data -enrich_csv(){ +cut_enrich_csv(){ CSV_FILE="$1" + NEW_CSV_FILE="${OUTPUT}/data_discovery/$(basename "$CSV_FILE")" CURRENT_THEMATIC="$2" + [ $VERBOSE -ge 2 ] && echo "Cut CSV file..." + [ $VERBOSE -ge 2 ] && echo "processing with file : ${CSV_FILE}..." + # keeping DD columns with groupId but removing linkedResourcesID + cut -f 1-8 "${CSV_FILE}" > "${NEW_CSV_FILE}" + if [ "${CURRENT_THEMATIC}" == "phenotyping" ]; then - [ $VERBOSE -ge 1 ] && echo "Enrich variables..." - [ $VERBOSE -ge 2 ] && echo "processing with file : ${CSV_FILE}..." - "${CURRENT_DIR}"/variables_enrichment.sh -f "${CSV_FILE}" "${VERBOSE_OPTION}" + [ $VERBOSE -ge 2 ] && echo "Enrich variables..." + [ $VERBOSE -ge 2 ] && echo "processing with file : ${NEW_CSV_FILE}..." 
+ "${CURRENT_DIR}"/variables_enrichment.sh -f "${NEW_CSV_FILE}" "${VERBOSE_OPTION}" fi } -export -f enrich_csv +export -f cut_enrich_csv transform_private_url(){ # change URL to their private form when first field does not start by a zero (2 pass: 1 => gnpis legacy ; 2 => faidare) # transformed files are written into `${OUTPUT}/privatised` sub-directory CSV_FILE="$1" - if [[ ! "${FAIDARE_URL}" =~ "https://urgi.versailles.inrae.fr/faidare-" ]] ; then # we have a production URL + if [[ "${FAIDARE_URL}" = "https://urgi.versailles.inrae.fr/faidare" ]] ; then # we have a production URL sed 's/\t/ /g ; s/^"//g ; s/","/\t/g ; s/"$//g' "${CSV_FILE}" | sed -r "s#^([^0].*)(https://urgi.versailles.inrae.fr)(.*)#\1\2/private\3#g ; s#^([^0].*)(https://urgi.versailles.inrae.fr/private/faidare)(.*)#\1https://urgi.versailles.inrae.fr/faidare-private\3#g" > "${OUTPUT}/privatised/$(basename "$CSV_FILE")" else sed 's/\t/ /g ; s/^"//g ; s/","/\t/g ; s/"$//g' "${CSV_FILE}" > "${OUTPUT}/privatised/$(basename "$CSV_FILE")" @@ -282,8 +288,7 @@ dd_convert_csv_to_json() { fi if [ -n "$(ls -1 ${OUTPUT}/privatised/gnpis_${LOCAL_THEMATIC}*.csv)" ]; then - cp ${OUTPUT}/privatised/gnpis_${LOCAL_THEMATIC}*.csv ${OUTPUT}/data_discovery/ - parallel ${PARALLEL_VERBOSE_2} enrich_csv "{}" "${LOCAL_THEMATIC}" ::: "${OUTPUT}/data_discovery/gnpis_${LOCAL_THEMATIC}*.csv" + parallel ${PARALLEL_VERBOSE_2} cut_enrich_csv "{}" "${LOCAL_THEMATIC}" ::: ${OUTPUT}/privatised/gnpis_${LOCAL_THEMATIC}*.csv else [ $VERBOSE -ge 1 ] && echo -e "${ORANGE}No CSV file matching gnpis_${LOCAL_THEMATIC}*.csv found...${NC}" continue @@ -367,7 +372,7 @@ echo -e "\n${BOLD}Manage private data...${NC}" [ ! -d "${OUTPUT}/privatised" ] && mkdir "${OUTPUT}/privatised" if [ -n "$(ls -1 ${OUTPUT}/gnpis_*.csv)" ]; then [ $VERBOSE -ge 1 ] && echo "Transform private URL..." 
- parallel ${PARALLEL_VERBOSE_2} transform_private_url ::: "${OUTPUT}/gnpis_"*.csv + parallel ${PARALLEL_VERBOSE_2} transform_private_url ::: ${OUTPUT}/gnpis_*.csv else [ $VERBOSE -ge 1 ] && echo -e "${ORANGE}No CSV file matching gnpis_*.csv found...${NC}" fi diff --git a/etl_gnpis-core_dd/map_values_to_json.jq b/etl_gnpis-core_dd/map_values_to_json.jq index f457189..eeb800e 100644 --- a/etl_gnpis-core_dd/map_values_to_json.jq +++ b/etl_gnpis-core_dd/map_values_to_json.jq @@ -12,7 +12,7 @@ split("\n") | .[] "name": .[4], "description": .[5], "url": .[6], - "species": .[7]|tostring|split("%2C "), # TODO: check that this split is done correctly, I doubt... + "species": .[7]|tostring|split(", "), # TODO: check that this split is done correctly, I doubt... "linkedResourcesID": ([ foreach (.[8]? | tostring | split(", ")[]) as $pui -- GitLab