awk script to extract transcript information from gff3 file

I need help extracting transcript information from a GFF3 file.
Here is the input:

Chr01	JGI	gene	82773	86941	.	-	.	ID=Potri.001G000900;Name=Potri.001G000900
Chr01	JGI	mRNA	82793	86530	.	-	.	ID=PAC:27047814;Name=Potri.001G000900.1;pacid=27047814;longest=1;Parent=Potri.001G000900
Chr01	JGI	exon	86331	86530	.	-	.	ID=PAC:27047814.exon.1;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	86331	86530	.	-	0	ID=PAC:27047814.CDS.1;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	exon	85729	85816	.	-	.	ID=PAC:27047814.exon.2;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	85729	85816	.	-	1	ID=PAC:27047814.CDS.2;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	exon	85531	85590	.	-	.	ID=PAC:27047814.exon.3;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	85531	85590	.	-	0	ID=PAC:27047814.CDS.3;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	exon	85162	85224	.	-	.	ID=PAC:27047814.exon.4;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	85162	85224	.	-	0	ID=PAC:27047814.CDS.4;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	exon	84838	85020	.	-	.	ID=PAC:27047814.exon.5;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	84838	85020	.	-	0	ID=PAC:27047814.CDS.5;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	exon	84635	84746	.	-	.	ID=PAC:27047814.exon.6;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	84635	84746	.	-	0	ID=PAC:27047814.CDS.6;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	exon	84304	84521	.	-	.	ID=PAC:27047814.exon.7;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	84304	84521	.	-	2	ID=PAC:27047814.CDS.7;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	exon	82793	83260	.	-	.	ID=PAC:27047814.exon.8;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	three_prime_UTR	82793	83167	.	-	.	ID=PAC:27047814.three_prime_UTR.1;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	CDS	83168	83260	.	-	0	ID=PAC:27047814.CDS.8;Parent=PAC:27047814;pacid=27047814
Chr01	JGI	mRNA	82773	86941	.	-	.	ID=PAC:27047815;Name=Potri.001G000900.2;pacid=27047815;longest=0;Parent=Potri.001G000900
Chr01	JGI	exon	86686	86941	.	-	.	ID=PAC:27047815.exon.1;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	five_prime_UTR	86686	86941	.	-	.	ID=PAC:27047815.five_prime_UTR.1;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	86331	86489	.	-	.	ID=PAC:27047815.exon.2;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	86331	86470	.	-	0	ID=PAC:27047815.CDS.1;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	five_prime_UTR	86471	86489	.	-	.	ID=PAC:27047815.five_prime_UTR.2;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	85729	85816	.	-	.	ID=PAC:27047815.exon.3;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	85729	85816	.	-	1	ID=PAC:27047815.CDS.2;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	85531	85590	.	-	.	ID=PAC:27047815.exon.4;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	85531	85590	.	-	0	ID=PAC:27047815.CDS.3;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	85162	85224	.	-	.	ID=PAC:27047815.exon.5;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	85162	85224	.	-	0	ID=PAC:27047815.CDS.4;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	84838	85035	.	-	.	ID=PAC:27047815.exon.6;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	84838	85035	.	-	0	ID=PAC:27047815.CDS.5;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	84635	84746	.	-	.	ID=PAC:27047815.exon.7;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	84635	84746	.	-	0	ID=PAC:27047815.CDS.6;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	84304	84521	.	-	.	ID=PAC:27047815.exon.8;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	84304	84521	.	-	2	ID=PAC:27047815.CDS.7;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	exon	82773	83260	.	-	.	ID=PAC:27047815.exon.9;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	three_prime_UTR	82773	83167	.	-	.	ID=PAC:27047815.three_prime_UTR.1;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	CDS	83168	83260	.	-	0	ID=PAC:27047815.CDS.8;Parent=PAC:27047815;pacid=27047815
Chr01	JGI	gene	95641	101115	.	+	.	ID=Potri.001G001200;Name=Potri.001G001200
Chr01	JGI	tRNA	95641	100989	.	+	.	ID=PAC:27041679;Name=Potri.001G001200.2;pacid=27041679;longest=0;Parent=Potri.001G001200
Chr01	JGI	exon	95641	95818	.	+	.	ID=PAC:27041679.exon.1;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	95641	95818	.	+	0	ID=PAC:27041679.CDS.1;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	96385	96554	.	+	.	ID=PAC:27041679.exon.2;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	96385	96554	.	+	2	ID=PAC:27041679.CDS.2;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	97086	97143	.	+	.	ID=PAC:27041679.exon.3;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	97086	97143	.	+	0	ID=PAC:27041679.CDS.3;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	97438	97571	.	+	.	ID=PAC:27041679.exon.4;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	97438	97571	.	+	2	ID=PAC:27041679.CDS.4;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	97644	97768	.	+	.	ID=PAC:27041679.exon.5;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	97644	97768	.	+	0	ID=PAC:27041679.CDS.5;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	97920	98095	.	+	.	ID=PAC:27041679.exon.6;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	97920	98095	.	+	1	ID=PAC:27041679.CDS.6;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	98894	99082	.	+	.	ID=PAC:27041679.exon.7;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	98894	99082	.	+	2	ID=PAC:27041679.CDS.7;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	99193	100456	.	+	.	ID=PAC:27041679.exon.8;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	CDS	99193	100070	.	+	2	ID=PAC:27041679.CDS.8;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	three_prime_UTR	100071	100456	.	+	.	ID=PAC:27041679.three_prime_UTR.1;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	100508	100734	.	+	.	ID=PAC:27041679.exon.9;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	three_prime_UTR	100508	100734	.	+	.	ID=PAC:27041679.three_prime_UTR.2;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	exon	100874	100989	.	+	.	ID=PAC:27041679.exon.10;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	three_prime_UTR	100874	100989	.	+	.	ID=PAC:27041679.three_prime_UTR.3;Parent=PAC:27041679;pacid=27041679
Chr01	JGI	tRNA	95641	101115	.	+	.	ID=PAC:27041680;Name=Potri.001G001200.1;pacid=27041680;longest=1;Parent=Potri.001G001200
Chr01	JGI	exon	95641	95818	.	+	.	ID=PAC:27041680.exon.1;Parent=PAC:27041680;pacid=27041680
Chr01	JGI	CDS	95641	95818	.	+	0	ID=PAC:27041680.CDS.1;Parent=PAC:27041680;pacid=27041680
Chr01	JGI	exon	96385	96554	.	+	.	ID=PAC:27041680.exon.2;Parent=PAC:27041680;pacid=27041680
Chr01	JGI	CDS	96385	96554	.	+	2	ID=PAC:27041680.CDS.2;Parent=PAC:27041680;pacid=27041680
Chr01	JGI	exon	97086	97143	.	+	.	ID=PAC:27041680.exon.3;Parent=PAC:27041680;pacid=27041680
Chr01	JGI	CDS	97086	97143	.	+	0	ID=PAC:27041680.CDS.3;Parent=PAC:27041680;pacid=27041680
Chr01	JGI	exon	97438	97571	.	+	.	ID=PAC:27041680.exon.4;Parent=PAC:27041680;pacid=27041680
Chr01	JGI	CDS	97438	97571	.	+	2	ID=PAC:27041680.CDS.4;Parent=PAC:27041680;pacid=27041680

Here is the output

transcript_id		gene_name		description	chromosome	strand	transcript_start	transcript_end	gene_start	gene_end
Potri.001G000900.1	Potri.001G000900	desc	Chr01	-	82793	86530	82773	86941
Potri.001G000900.2	Potri.001G000900	desc	Chr01	-	82773	86941	82773	86941
Potri.001G001200.2	Potri.001G001200	desc	Chr01	+	95641	100989	95641	101115
Potri.001G001200.1	Potri.001G001200	desc	Chr01	+	95641	101115	95641	101115

I have been trying to get this output for many months, but I still have not found a good solution. I appreciate all your effort and help.
Thank you in advance.

I came up with the following command, but its output is not correct.

# Attempt from the original post (described there as not correct).
# The g* variables hold fields of the PREVIOUS record: they are assigned at the
# end of the action, after the print, so they always lag one record behind.
# When the previous record's type (g3) was "mRNA", one row is printed that mixes
#   - b[1]:  text between the first "=" and the following ";" in the CURRENT $9
#   - gb[1]: the same piece taken from the previous record's $9 (g9)
#   - $1, $7, $4, $5 of the current record, and g4/g5 (previous start/end)
# so the row pairs the mRNA record with whichever record follows it.
# g1 and g2 are stored but never read. No input file is named, so awk reads
# standard input; fields are split on the default whitespace separator.
awk '{if(g3=="mRNA"){split($9,a,"=");split(a[2],b,";");split(g9,ga,"=");split(ga[2],gb,";");print b[1]"\t"gb[1]"\tDesc\t"$1"\t"$7"\t"g4"\t"g5"\tPAC\tPEP\t"$4"\t"$5};g3=$3;g1=$1;g2=$2;g4=$4;g5=$5;g9=$9}'
# Build a tab-separated transcript table from the GFF3 file "input".
# One output row per *RNA feature: transcript Name, parent gene ID, a
# placeholder description, chromosome, strand, transcript start/end and the
# start/end of the parent gene. Gene rows must precede their transcripts.
awk '
BEGIN {
  # GFF3 columns are tab-delimited and column 9 may contain spaces, so split
  # on tabs only. Setting OFS here also makes the header tab-separated, and
  # the header is printed even when the input has no records.
  FS = OFS = "\t"
  print "transcript_id", "gene_name", "description", "chromosome", "strand", "transcript_start", "transcript_end", "gene_start", "gene_end"
}

# Fill attr[key] = value from a column-9 string such as "ID=x;Name=y".
# Attributes are looked up by key, so their order in the file does not matter.
function parse_attrs(col, attr,    parts, n, i, eq) {
  split("", attr)                      # portable way to empty the array
  n = split(col, parts, ";")
  for (i = 1; i <= n; i++) {
    eq = index(parts[i], "=")
    if (eq > 0)
      attr[substr(parts[i], 1, eq - 1)] = substr(parts[i], eq + 1)
  }
}

# Gene-level rows (gene, pseudogene, ...): remember coordinates under the
# feature ID, because that is what a child row names in its Parent attribute.
$3 ~ /gene$/ {
  parse_attrs($9, g)
  gene_start[g["ID"]] = $4
  gene_end[g["ID"]] = $5
  next
}

# Transcript rows (mRNA, tRNA, ...): emit one table row each.
$3 ~ /.RNA/ {
  parse_attrs($9, t)
  print t["Name"], t["Parent"], "Desc", $1, $7, $4, $5, gene_start[t["Parent"]], gene_end[t["Parent"]]
}
' input
1 Like

Private IDs

# Print a tab-separated transcript table for the GFF3 file "input":
# transcript Name, parent gene ID, placeholder description, chromosome,
# strand, transcript start/end, parent gene start/end.
# Gene rows are expected to appear before the transcripts that refer to them.
awk '
BEGIN {
  # Tab is the only GFF3 column delimiter; also used for the output.
  FS = OFS = "\t"
  print "transcript_id", "gene_name", "description", "chromosome", "strand", "transcript_start", "transcript_end", "gene_start", "gene_end"
}

# Return the value of attribute "key" in a column-9 string, or "" if absent.
# A ";" is prepended so that the key only matches at the start of an
# attribute: asking for Name never picks up a key such as Alt_Name.
function attr(col, key,    hay, pos, val) {
  hay = ";" col
  pos = index(hay, ";" key "=")
  if (pos == 0)
    return ""
  val = substr(hay, pos + length(key) + 2)   # skip ";", the key and "="
  sub(/;.*/, "", val)                        # cut at the next attribute
  return val
}

# Gene-level rows: index coordinates by ID, which is what Parent refers to.
$3 ~ /gene$/ {
  id = attr($9, "ID")
  gs[id] = $4
  ge[id] = $5
  next
}

# Transcript rows (mRNA, tRNA, ...).
$3 ~ /.RNA/ {
  p = attr($9, "Parent")
  print attr($9, "Name"), p, "Desc", $1, $7, $4, $5, gs[p], ge[p]
}
' input
1 Like

Thank you so much, this works perfectly.

Yes, but for future reference (when you post a question here):

https://www.unix.com/members-only/283755-effective-today-new-rule-3-forum-rules-new-post.html