#!/usr/bin/perl =head1 cdsgff2seq.pl read GFF, genome fasta; write CDS sequence, aa translation (with check for exon phase). write gene/exon offset subranges. =head1 usage -- CDS sequence cdsgff2seq -a dna -t exon region.gff chromfa.dir/ > out.cdna -- Amino translation cdsgff2seq -a aa -t CDS region.gff chromfa.dir/ > out.pep -- Subsequence ranges around gene,exon boundaries cdsgff2seq.pl -a genestart -offset=-99,9 -t exon -gff ${dp}_scaf7*.gff -fasta $sc/${dp}/perchr/ > ${dp}-sc7g5.fa cdsgff2seq.pl -a intronstart -offset=-19,20 -t exon -gff ${dp}_scaf7*.gff -fasta $sc/${dp}/perchr/ > ${dp}-sc7in1.fa cdsgff2seq.pl -a intronend -offset=-19,20 -t exon -gff ${dp}_scaf7*.gff -fasta $sc/${dp}/perchr/ > ${dp}-sc7in2.fa options: -v verbose ; -t CDS restrict to this feature (CDS,exon default) -a dna or -a aa : option to output aa translations, cds dna =head1 Requirements Uses BioPerl Bio::SeqIO to read sequence files for exon subranges Inputs: 1. GFF v3 for feature locations (exons, CDS). Parent= or ID= must distinguish genes. 2. Directory of FastA genome sequence matching GFF Reference field (e.g. faSplit byname genome.fasta) Uses dna fasta file per scaffold/chromosome as $dnapath/$ref.fa =head1 Author d.gilbert, aug2006, gilbertd@indiana.edu updated march 2007 for gene/exon subsets from cdsphase.pl, august 2006 =head1 subset seqs 07mar addition: subset sequence output options choose range around exons to output: 5prime, 3prime,intron regions +/- n bases from boundary -a genestart geneend intronstart intronend : fixme -offset -50,10 * want option to do several/all -astart/end -offset x,y to separate files in one pass =head1 example subsets # get some gff gene models (est6 six-pack) curl 'http://insects.eugenes.org/species/cgi-bin/gbrowse/dvir/?name=scaffold_12855:9338335..9363334;label=NCBI_GNO;plugin=GFFDumper;plugin_do=Go' > dvir-est6gno2.gff set dp=dvir perl cdsgff2seq.pl -a genestart -o=-99,9 -t exon -gff ${dp}-est6gno.gff -fasta $sc/${dp}3/perchr/ > ${dp}-gb1.fa perl cdsgff2seq.pl -a geneend -o=-9,99 -t exon -gff ${dp}-est6gno.gff -fasta $sc/${dp}3/perchr/ > ${dp}-ge1.fa perl cdsgff2seq.pl -a intronstart -o=-19,20 -t exon -gff ${dp}-est6gno.gff -fasta $sc/${dp}3/perchr/ > ${dp}-in1.fa perl cdsgff2seq.pl -a intronend -o=-19,20 -t exon -gff ${dp}-est6gno.gff -fasta $sc/${dp}3/perchr/ > ${dp}-in2.fa set dp=dpulex perl cdsgff2seq.pl -a genestart -o=-99,9 -t exon -gff ${dp}_scaf7*.gff -fasta $sc/${dp}4/perchr/ > ! ${dp}-sc7g5.fa perl cdsgff2seq.pl -a intronend -o=-19,20 -t exon -gff ${dp}_scaf7*.gff -fasta $sc/${dp}4/perchr/ > ! ${dp}-sc7in2.fa perl cdsgff2seq.pl -a intronstart -o=-19,20 -t exon -gff ${dp}_scaf7*.gff -fasta $sc/${dp}4/perchr/ > ! ${dp}-sc7in1.fa # use WebLogo for intron/exon junction and 5' logos http://bespoke.lbl.gov/weblogo/ # or find motifs with .......... See http://www.sanger.ac.uk/Software/analysis/nmica/ for analyzing gene subset sequences for motifs. set mp=geall ; $nm/makemosaicbg -seqs $mp.fa -mosaicClasses 1 -mosaicOrder 1 -out $mp.sbg $nm/motiffinder -out $mp-mot.xms -seqs $mp.fa -backgroundModel $mp.sbg -numMotifs 3 =cut use strict; use lib("/bio/argos/common/perl/lib/"); use Bio::SeqIO; my $verbose= 0; my $outh = *STDOUT; my $gffin= *STDIN; my $MIN_CDS_LEN = 60; my $MIN_AA_LEN = 20; my $trustphase= 0; # each predictor uses diff feature types; do we also need mRNA,gene parent? my @cdsTypes= qw(CDS exon); # FIXME my %cdsTypes= map { $_ => 1; } @cdsTypes; my %seenType=(); my $seqtype= "dna"; # dna/cds aa/protein # expect args: cdsgff2seq.pl data.gff[.gz] dnafastapath/ my %args=(); use Getopt::Long; my $ok= &GetOptions( \%args, "gff=s" , "fasta|dnafastapath=s" , "t|types|features=s" , "a|seqtype=s" , ## dna,aa,protein,... ## a added: genestart,geneend,intronstart,intronend "o|offset=s" , ## -50..0/genestart ; 0..30/geneend ; -10..40/intronstart ; -40..10/intronend "casechange!" , ## for -n .. +n, change seq case to reflect -/+ boundary "v|verbose!" , "debug!" , ); my $gff= $args{gff} || shift(@ARGV); my $genomefasta= $args{fasta} || shift(@ARGV); die "Usage: cdsgff2seq.pl [-verbose] [-adna,protein] [-tCDS] annot.gff[.gz] dnafastapath/\n" unless($ok && -f $gff && -d $genomefasta); if($gff =~ /\.gz$/) { $ok= open(GFF, "gunzip -c $gff|"); $gffin = *GFF; } elsif($gff =~ /stdin/) { $ok=1; $gffin=*STDIN; } else { $ok= open(GFF,$gff); $gffin = *GFF; } my $debug= $args{debug}; my $casechange= $args{casechange}; my @offset= ($args{o}) ? split(/[,;.]/,$args{o}) : (); if($args{v}){ $verbose= $args{v}; } if($args{t}){ @cdsTypes=split(/[,;.-]/,$args{t}); } if($args{a}){ $seqtype=$args{a}; } $verbose= 1 if($debug); processGff($gffin,$genomefasta,$outh); close($gffin); close($outh); #-------------------- sub processGff { my($gffHandle,$orggenomefasta,$outh)= @_; %seenType=(); %cdsTypes= map { $_ => 1; } @cdsTypes; warn "# cdsgff2seq: $orggenomefasta\n" if $verbose; my($faIO, $refdna)= (undef, undef); my($l_id, $l_ref, $skipref)=("")x3; my( @feats,@cds); # any patch for $ref to gff-ref ? e.g. dpse 'Ch'; dper/dsec super/scaffold; ... ## assume cds grouped by ID in gff while(my $gffin= <$gffHandle>) { if($gffin =~ m/^\W/) { next; } ## print $outh $gffin; ## need to collect all CDS_exons/gene-mRNA first ... my @gff= split "\t",$gffin,9; my $ref = $gff[0]; my $type= $gff[2]; chomp($gff[8]); my $id= $gff[8]; $id =~ s/^(ID|Parent)=//; $id =~ s/[;,\s].*$//; $seenType{$type}++; # check here for exon AND CDS : drop exon if have CDS #?? %cdsTypes=("CDS" => 1) if( $type eq "CDS" ); #$cdsTypes{"exon"}= 0 if($id ne $l_id && @cds) { my $cdsnew= cds2seq( $refdna, \@cds, $seqtype); # sorted @cds / gene; not all @feats # foreach my $gff (@$cdsnew) { print $outh join("\t",@$gff),"\n"; } print $outh $cdsnew; # fully formatted? @cds=(); # @feats=(); } if($ref ne $l_ref) { undef $refdna; undef $faIO; $skipref=0; my $fafile= "$orggenomefasta/$ref.fa"; # check here for $ref.fa variations: dpse: Ch2..ChX.. unless(-e $fafile) { my $fref= $ref; if($fref =~ /^Ch/) { $fref =~ s/^Ch//; } elsif($fref =~ /^super/) { $fref =~ s/^super/scaffold/; } $fafile= "$orggenomefasta/$fref.fa"; } unless(-e $fafile) { warn "*** Cant find dna.fa for $ref at $fafile\n"; $skipref=1; } else { $faIO = Bio::SeqIO->new(-file => $fafile,-format => 'Fasta'); $refdna = $faIO->next_seq(); # wait till have gff ? } # warn "# cdsgff2seq[$ref]: $fafile\n" if $verbose; $l_ref= $ref; } if( $cdsTypes{$type} && !$skipref) { push(@cds,\@gff); } # else { print $outh $gffin; } #? dont need to keep $l_id= $id; $l_ref= $ref; } if(@cds) { my $cdsnew= cds2seq( $refdna, \@cds, $seqtype); # sorted @cds / gene; not all @feats # foreach my $gff (@$cdsnew) { print $outh join("\t",@$gff),"\n"; } print $outh $cdsnew; # fully formatted? @cds=(); } undef $refdna; undef $faIO; } sub cds2seq { my($refdna, $cdsA, $seqtype)= @_; ## assume cdsA are all/only cds exon set for one gene/mrna return "" unless(ref $refdna); my $cstrand= $cdsA->[0]->[6]; my $isrev= ($cstrand eq '-' || $cstrand < 0); my @cds; # sort by start if ($isrev) { @cds= sort{ $b->[3] <=> $a->[3] } @$cdsA; } # end 1st else { @cds= sort{ $a->[3] <=> $b->[3] } @$cdsA; } # start 1st my $nt_length= 0; my $ispartial= 0; my $phase0= 0; my $cdsdna= ""; my @exondna=(); my $id=""; my ($gref,$gstart,$gstop,$gtype,$gsrc,$gstrand,$gattr)= ("")x10; my $header=""; my $issubset= ($seqtype =~ /start|end/i); foreach my $ix (0 .. $#cds) { my($ref,$src,$type,$start,$stop,$score,$strand,$phase,$attr)= @{$cds[$ix]}; my($rstart,$rstop)= ($start,$stop); my($offa,$offb)= @offset; if($issubset && @offset) { if($offa>$offb) { ($offa,$offb)= ($offb,$offa); } ($start,$stop)= ($stop,$start) if($isrev); ($offa,$offb) = (-$offa,-$offb) if($isrev); if ($seqtype =~ /genestart|intronend/i) { ($start,$stop)= ($start + $offa, $start + $offb); } elsif ($seqtype =~ /geneend|intronstart/i) { ($start,$stop)= ($stop + $offa, $stop + $offb); } } #?? fixme for offset err? if($start>$stop) { ($start,$stop)= ($stop,$start); } if($ix == 0) { if($seqtype =~ /genestart/i) { $src.="-g5"; } elsif($seqtype =~ /geneend/i) { $src.="-g3"; } elsif($seqtype =~ /intronstart/i) { $src.="-in5"; } elsif($seqtype =~ /intronend/i) { $src.="-in3"; } $id=$attr; $id =~ s/^(ID|Parent)=//; $id =~ s/[;,\s].*$//; $gattr=$attr; if($gattr=~s/^[^;]+;// && $gattr=~/\w/) {$gattr.=";";} else {$gattr="";} ($gref,$gstart,$gstop,$gsrc,$gtype,$gstrand)= ($ref,$start,$stop,$src,$type,$strand); $ispartial= 0; $phase0= $phase || 0; #? use for atg ? } my($bstart,$blen)= ($start - 1, $stop - $start + 1); if($seqtype =~ /genestart/i) { next unless($ix == 0); } elsif($seqtype =~ /geneend/i) { next unless($ix == $#cds); } elsif($seqtype =~ /intronstart/i) { next unless($ix >=0 && $ix < $#cds); } elsif($seqtype =~ /intronend/i) { next unless($ix >0 && $ix <= $#cds); } my $exondna = substr( $refdna->seq(), $bstart, $blen); if($isrev) { $exondna = reverse $exondna; $exondna =~ tr/gatcGATC/ctagCTAG/; } # if($issubset && $casechange) { # # $offa < $offb ... both may be neg or pos. # my $ibeg= ($rstart - $bstart+1); # my $iin= ($rstop - $rstart); # my $iend= ($stop - $rstop); # my $xb= substr($exondna,0,$ibeg); # my $xi= substr($exondna,$ibeg,$iin-$ibeg); # my $xe= substr($exondna,$iin,$iend-$iin); # $xb =~ tr/A-Z/a-z/; # $xi =~ tr/a-z/A-Z/; # $xe =~ tr/A-Z/a-z/; # $exondna = $xb . $xi . $xe; # } $cdsdna.= $exondna; ## ^^ FIXME: don't append like this for $issubset ; need each exon/intron separate fasta entry push(@exondna,$exondna); if( $ix == 0 ) { # only for cds types ... ## my $inc5= 0; ## 1st exon; find start ATG; ** only need 3 bases at start, not all unless($issubset) { my $atg= substr($exondna, $phase0, 3); $ispartial=1 if($atg !~ /atg/i); } } else { $gstart=$start if($start<$gstart); $gstop= $stop if($stop>$gstop); } } my $minlen= $MIN_CDS_LEN; if($seqtype =~ /aa|prot/i) { my($inc5,$protaa,$aascore); if(! $ispartial || ( $trustphase && $phase0 =~ /\d/ )) { ($inc5,$protaa,$aascore) = getBestFrame( $cdsdna, $id, ! $ispartial, $phase0); } else { ($inc5,$protaa,$aascore) = getBestFrame( $cdsdna, $id, ! $ispartial); } $cdsdna= $protaa; $minlen= $MIN_AA_LEN; } elsif($issubset) { $minlen=1; } my $slen= length($cdsdna); return "" if($slen < $minlen); if($issubset) { my $i=0; my $fa=""; foreach my $ex (@exondna) { $i++; $header=">$id.$i loc=$gref:$gstart-$gstop:$gstrand;type=$gtype.$gsrc;${gattr}len=$slen"; $ex =~ s/(.{1,50})/$1\n/g; $fa .= $header."\n".$ex; } return $fa; } else { $header=">$id loc=$gref:$gstart-$gstop:$gstrand;type=$gtype.$gsrc;${gattr}len=$slen"; $header .= ";partial_gene=true" if ($ispartial); return $header."\n" if($debug); $cdsdna =~ s/(.{1,50})/$1\n/g; return $header."\n".$cdsdna; } } ## translation methods my @s5CodonTable = (); BEGIN{ @s5CodonTable = ( [ ['K','N','K','N','X',], ['T','T','T','T','T',], ['R','S','R','S','X',], ['I','I','M','I','X',], ['X','X','X','X','X',], ], [ ['Q','H','Q','H','X',], ['P','P','P','P','P',], ['R','R','R','R','R',], ['L','L','L','L','L',], ['X','X','X','X','X',], ], [ ['E','D','E','D','X',], ['A','A','A','A','A',], ['G','G','G','G','G',], ['V','V','V','V','V',], ['X','X','X','X','X',], ], [ ['*','Y','*','Y','X',], ['S','S','S','S','S',], ['*','C','W','C','X',], ['L','F','L','F','X',], ['X','X','X','X','X',], ], [ ['X','X','X','X','X',], ['X','X','X','X','X',], ['X','X','X','X','X',], ['X','X','X','X','X',], ['X','X','X','X','X',], ], ); } sub ibase { my $c= substr($_[0],$_[1],1); return 0 if ($c eq 'A'); return 1 if ($c eq 'C'); return 2 if ($c eq 'G'); return 3 if ($c eq 'T'); return 4; } sub translate { my($cds, $offset)= @_; $cds = uc($cds); ## fix chars ?? my $aa=""; my $aa_length = int((length($cds) - $offset) / 3); for (my $i = 0; $i < $aa_length; $i++) { my $idx = 3 * $i + $offset; $aa .= $s5CodonTable[ ibase($cds,$idx)][ ibase($cds,$idx+1) ][ ibase($cds,$idx+2) ]; } return $aa; } sub getBestFrame { my($cds, $id, $isfullcds, $usephase)= @_; my ($bestscore,$besti,$bestpro)= (-999999,0,""); my $ph0= ($usephase =~ /\d/) ? $usephase : 0; my $nframe= ($isfullcds ? $ph0+1 : 3); for (my $i= $ph0; $i<$nframe; $i++) { my $pro= translate( $cds, $i ); my $score = $pro =~ tr/*/*/; # has_internal_stops($pro); #is inner M bad?# $score += $pro =~ tr/M/M/; # has_internal_starts($pro); $score *= -3; if (substr($pro,length($pro)-1,1) eq '*') { $score += 4; } # adj internal == end if (substr($pro,0,1) eq 'M') { $score += 1; } if ($score > $bestscore) { $besti= $i; $bestscore=$score; $bestpro=$pro; } warn("# bestFrame[$i,$id]: $score ; $pro \n") if($verbose); # debug } return wantarray ? ($besti,$bestpro,$bestscore) : $besti; } __END__ # sub cdsPhase { # my($refdna, $cdsA)= @_; # ## assume cdsA are all/only cds exon set for one gene/mrna # return $cdsA unless(ref $refdna); # # my $cstrand= $cdsA->[0]->[6]; # my $isrev= ($cstrand eq '-' || $cstrand < 0); # # my @cds; # sort by start # if ($isrev) { @cds= sort{ $b->[3] <=> $a->[3] } @$cdsA; } # end 1st # else { @cds= sort{ $a->[3] <=> $b->[3] } @$cdsA; } # start 1st # my $nt_length= 0; # my $ispartial= 0; # # foreach my $ix (0 .. $#cds) { # my($ref,$src,$type,$start,$stop,$score,$strand,$phase,$attr)= @{$cds[$ix]}; # my $id=$attr; $id =~ s/^(ID|Parent)=//; $id =~ s/[;,\s].*$//; # # do we check exon ordering? use as given in gff? need 1st .. last, differs for strands # # if($ix == 0) { # ## 1st exon; find start ATG; ** only need 3 bases at start, not all # $ispartial= 0; # my($bstart,$blen)= ($start - 1, $stop - $start + 1); # #my($bstart,$blen)= ($isrev) ? ($stop-8,8) : ($start-1, 8); # my $exondna = substr( $refdna->seq(), $bstart, $blen); # if($isrev) { # $exondna = reverse $exondna; # $exondna =~ tr/gatcGATC/ctagCTAG/; # } # my $inc5= 0; # for (; $inc5<=3; $inc5++) { # my $atg= substr($exondna, $inc5, 3); # last if($atg =~ /atg/i); # } # # ## fixme, if $ispartial probably need check best aa translation frame # ## yes; need full cds/all exons and translate() method # if ($inc5 > 2) { # $nt_length = 0; $ispartial=1; $inc5 = 0; #start not found/incomplete prot ? # my $cdsdna= $exondna; # foreach my $ex (1 .. $#cds) { # my($ref1,$src1,$type1,$start1,$stop1,$score1,$strand1,$phase1,$attr1)= @{$cds[$ex]}; # my($bstart,$blen)= ($start1 - 1, $stop1 - $start1 + 1); # my $exon2dna = substr( $refdna->seq(), $bstart, $blen); # if($strand1 eq '-' || $strand1 < 0) { # $exon2dna = reverse $exon2dna; # $exon2dna =~ tr/gatcGATC/ctagCTAG/; # } # $cdsdna.= $exon2dna; # } # $inc5 = getBestFrame( $cdsdna, $id); # } # # if ($inc5 == 1) { $nt_length = 2; } # elsif ($inc5 == 2) { $nt_length = 1; } # else { $nt_length = 0; } # } # # my($inc5,$inc3,$elength,$frame); # $elength = $stop - $start + 1; # $nt_length += $elength; # $inc3 = $nt_length % 3; # $inc5 = ($elength - $inc3) % 3; # only care about this one # $frame = ($start + $inc5) % 3; # if ($inc5 == -1) { $inc5 = 2; } # # my $changed=0; # if ($phase eq '.') { $changed=1; } # elsif ($phase ne $inc5 ) { # $changed=2; # warn "# phase change exon[$ix]: $phase => $inc5; $ref:$start-$stop/$strand,$type:$src,$id\n" if $verbose; # } # if($changed) { $cds[$ix]->[7]= $inc5; } # if($ispartial && $ix == 0) { $cds[$ix]->[8] .= ";partial_gene=true"; } # 5prime_partial=true; 3prime.. # } # # return \@cds; # } #item fasta db # dbm based index; not platform independent my $dna_db = Bio::DB::Fasta->new('/path/to/fasta/files'); my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000); my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000); my $obj = $db->get_Seq_by_id('CHROMOSOME_I'); my $seq = $obj->seq; my $subseq = $obj->subseq(4_000_000 => 4_100_000); #-- or plat-independent (inherits from above) -- my $lucene = new Bio::DB::GFF::Adaptor::lucene(xxx); my $dna_db = new Bio::DB::GFF::Adaptor::LuceneFasta( $fafile, _adaptor => $lucene ); #cut ## ATG is universal start codon for euk. nuclear genes ## for start= 0 .. 2, find ATG ## ? and check aatrans, internal *-stops, aa[0] == 'M', aa[-1]= '*' ## parts from zoeCDS.c of Snap/I.Korf if (exon->strand == '+') { c1 = dna->s5[exon->start]; c2 = dna->s5[exon->start +1]; c3 = dna->s5[exon->start +2]; if (c1 == 0 && c2 == 3 && c3 == 2) ef.start = 1; //# ATG } else if (exon->strand == '-') { c1 = dna->s5[exon->end]; c2 = dna->s5[exon->end -1]; c3 = dna->s5[exon->end -2]; if (c1 == 3 && c2 == 0 && c3 == 1) ef.start = 1; //# TAC (~ATG) } #.......... best_score = -1000000; best_idx = -1; for (i = 0; i <= 2; i++) { pro[i] = zoeTranslateDNA(tx->def, tx, i); score = - 3 * has_internal_stops(pro[i]); if (pro[i]->seq[pro[i]->length -1] == '*') score++; if (pro[i]->seq[0] == 'M') score++; if (score > best_score) { best_score = score; best_idx = i; } } bt.inc5 = best_idx; bt.inc3 = (tx->length - best_idx) % 3; bt.aa = pro[best_idx]; #.......... ## find phase == inc5 for all exons: ## /* label with correct phase and frame */ if (cds->inc5 == 1) nt_length = 2; else if (cds->inc5 == 2) nt_length = 1; else nt_length = 0; elength = 0; /*orig* for (i = 0; i < cds->exons->size; i++) */ { int i, exstart, exend, exinc; if(cds->strand == '-') { exstart= cds->exons->size-1; exend=-1; exinc=-1; } else { exstart= 0; exend= cds->exons->size; exinc=1; } for (i = exstart; i != exend; i += exinc ) { exon = cds->exons->elem[i]; elength = exon->end - exon->start + 1; nt_length += elength; inc3 = nt_length % 3; inc5 = (elength - inc3) % 3; frame = (exon->start + inc5) % 3; exon->frame = frame; exon->inc5 = inc5; if (exon->inc5 == -1) exon->inc5 = 2; exon->inc3 = inc3; if (!zoeVerifyFeature(exon)) { zoeWarn("exon does not validate after correcting phase & frame"); zoeExit("%s", cds->name); } } }