#!/usr/bin/perl use Getopt::Long; use File::Path; my($input, $output, $nbSeqByFile, $nbHitBySeq, $nbHspByHit, $help, $queryByFile); $help = " - i|input : path of the input file (must be text blast file output) - o|output : path of the output file (by default, the same as the input file) - s|sequences : number of sequences by xml file (default inf) - hit : number of hit to print for each sequences (default inf) - hsp : number of hsp to print for each hit (default inf) - help|h|? : print this help and exit "; $nbHitBySeq="inf"+0; $nbHspByHit="inf"+0; $queryByFile="inf"+0; GetOptions( 'i|input=s' => \$input, 'o|output=s' => \$output, 's|sequences=s' => \$queryByFile, 'hit=s' => \$nbHitBySeq, 'hsp=s' => \$nbHspByHit, 'help|h|?' => sub{print $help; exit} ); ; if (!$input){print $help; exit} if (!$output){$output=$input} open (INPUT_FILE, '< '.$input) or die ("\nCannot read input file: $!\n"); open (OUTPUT_FILE, '> '.$output.'_1.xml') or die ("\nCannot create output file: $!\n"); my $compteurFile=1; my $compteurQuery=0; my $compteurHit=0; my $compteurHsp=0; my $compteurQueryTotal=0; my $line = ; $line =~ s/[\s\n\r]*$//g; my($version, $program) = $line=~/(([^\s]+)\s.+)/; $program=~tr/[A-Z]/[a-z]/; my $toPrint; my $header=" $program $version Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), \"A greedy algorithm for aligning DNA sequences\", J Comput Biol 2000; 7(1-2):203-14. $program $output $compteurFile $output $compteurFile 1 10 1 -2 0 0 L;m; "; print OUTPUT_FILE $header; while (my $line=){ $line =~ s/[\s\n\r]*$//g; # Bloc Iteration if ($line=~/Query= .*\s*/){ $compteurHit=0; $compteurHsp=0; $compteurQuery++; $compteurQueryTotal++; my $queryLength; my $seqId; while($line && $line !~ />.*/ ){ $line =~ s/[\s\n\r]*$//g; if($line=~/\(([0-9]*) letters\)/g) {$queryLength = $1} elsif($line=~/Query= (.*)\s*/g) {$seqId = $1}; $line=; } $toPrint = " $compteurQuery $seqId $seqId $queryLength "; if ($compteurQuery > $queryByFile){ print OUTPUT_FILE "\n\n"; close (OUTPUT_FILE); $compteurQuery = 0; $compteurFile++; $toPrint = $header."\n".$toPrint; open (OUTPUT_FILE, '> '.$output.'_'.$compteurFile.'.xml') or die ("\nCannot create output file: $!\n"); } print OUTPUT_FILE $toPrint; } # Bloc Hit if ($line=~/^>.*/){ $compteurHsp=0; $compteurHit++; my $hitId; my $hitDef; my $hitAccession="NONE"; my $length; while($line && $line!~/Score =\s+[0-9\.]+ bits \([0-9\.]+\), Expect = .+/){ $line =~ s/[\s\n\r]*$//g; if($line=~/Length = (.*)/g) {$length = $1} elsif($line=~/^>*([^\|]*\|*([^\|]+)\|*(.*))/g) {if ($hitAccession eq "NONE") {$hitAccession = $2}; $hitDef.=$3; $hitId.=$1." ";} $line=; } if(!$hitDef){$hitDef=$hitId} $toPrint = "$compteurHit\n$hitId $hitDef $hitAccession $length"; if ($compteurHit<=$nbHitBySeq) {print OUTPUT_FILE $toPrint}; } if ($line=~/Score =\s+[0-9\.]+ bits \([0-9\.]+\), Expect = .+/){ $compteurHsp++; my $bitScore; my $score; my $evalue; my $identities; my $length; my $positive; my $gap=0; my $qframe=1; my $hframe=1; my $debutq; my $finq; my $query; my $debuth; my $finh; my $hit; my $midline; my $retrait = ""; do{ $line =~ s/[\s\n\r]*$//g; if($line=~/Frame = \+?([^\+]*)/g) {$qframe=$1;} elsif($line=~/Score =\s+([0-9\.]+) bits \(([0-9\.]+)\), Expect = (.+)/g) {$bitScore = $1; $score=$2; $evalue=$3;} elsif($line=~/Identities = ([0-9]+)\/([0-9]+) \([0-9]+%\), Positives = ([0-9]+)\/[0-9]+ \([0-9]+%\)(, Gaps = ([0-9]+)\/[0-9]+ \([0-9]+%\))*/) {$identities = $1; $length=$2; $positive=$3; if($5){$gap=$5;}} elsif($line=~/(Query:\s+([0-9]+)\s+)([^0-9]+)\s+([0-9]+)/) {if(!$debutq){$debutq = $2} $query.=$3; $finq=$4; $retrait=length($1)} elsif($line=~/Sbjct:\s+([0-9]+)\s+([^0-9]+)\s+([0-9]+)/) {if(!$debuth){$debuth = $1} $hit.=$2; $finh=$3} elsif($line=~/^\s{$retrait,$retrait}(.*)/) {$midline.= $1} $line=; } while($line && ($line!~/^>.*/ && $line !~ /Query=/ && $line !~ /Score =\s+[0-9\.]+ bits \([0-9\.]+\), Expect = .+/)); $toPrint=" $compteurHsp $score $bitScore $evalue $debutq $finq $debuth $finh $qframe $hframe $identities $positive $gap $length $query $hit $midline "; if ($compteurHit <= $nbHitBySeq && $compteurHsp <= $nbHspByHit) { print OUTPUT_FILE $toPrint } if (!$line || ($line=~/^>.*/ && $compteurHit < $nbHitBySeq )|| $line=~/Query=/){print OUTPUT_FILE "\n\n"} if (!$line || $line=~/Query=/ ){print OUTPUT_FILE "\n\n\n"} if ($line){redo} else { print OUTPUT_FILE "\n\n"} } } close (OUTPUT_FILE);