#!/usr/bin/perl
# the path to your perl distribution may be different
# to determine the path, type: which perl
#
# this script will take the standard output from probA and convert each
# suboptimal alignment into a file that can be read by MODELLER
#
# this program was written by stewart pomerantz and adam c marko
# last modification was 1.13.2006
# questions please contact marko@psc.edu
##############################################
# PROBA2PIR v0.1
##############################################
# To run this, type:
#   proba2pir.pl -if <input file> -of <output prefix> -structure u/l
#                -first <first residue> -last <last residue>
#
#   -if         the probA output file; you must capture it with 'script'
#               or copy/paste it into a file
#   -of         the name you wish to have on the .ali files
#   -structure  which sequence id, either upper (u) or lower (l),
#               is associated with a structure (template)
#   -first      first residue of protein structure (template)
#   -last       last residue of protein structure (template)
#
# Running this script successfully should result in
# an .ali file for each suboptimal alignment, named
# <output prefix>_NNNNNN.ali (alignment index, zero-padded to 6 digits).

use strict;
use warnings;

my $USAGE =
      "usage: proba2pir.pl -if <input file> -of <output prefix>\n"
    . "       -structure <u|l> -first <first residue> -last <last residue>\n";

##
## command line
##
# Exactly five "-flag value" pairs are expected, in any order.
usage_and_exit() if scalar(@ARGV) != 10;

my %is_flag  = map { $_ => 1 } qw(-if -of -structure -first -last);
my %value_of = ();
for (my $arg_no = 0; $arg_no < scalar(@ARGV); $arg_no++) {
    my $flag = $ARGV[$arg_no];
    next unless $is_flag{$flag};
    $value_of{$flag} = $ARGV[$arg_no + 1];
    $arg_no++;    # the value belongs to this flag; do not re-scan it
}

# Ten arguments do not guarantee that every flag was given (a flag may be
# repeated or misspelled), so check each one explicitly.
for my $flag (sort keys %is_flag) {
    usage_and_exit() unless defined $value_of{$flag};
}

my $inputfile    = $value_of{'-if'};
my $outputfile   = $value_of{'-of'};
my $structure_id = $value_of{'-structure'};
my $first_res    = $value_of{'-first'};
my $last_res     = $value_of{'-last'};

if ($structure_id ne 'u' && $structure_id ne 'l') {
    # Kept non-fatal: both records are then written as plain sequences.
    warn "Warning: -structure should be 'u' or 'l' (got '$structure_id'); "
       . "no structure (template) record will be written.\n";
}

print("==== input ====\n");
print("input file: '$inputfile'\n");
print("output file: '$outputfile'\n");
print("structure id: '$structure_id'\n");
print("first res: '$first_res'\n");
print("last res: '$last_res'\n");
print("===============\n");

##
## read the probA output
##
open(my $in_fh, '<', $inputfile)
    or die "Error: Could not open file '$inputfile': $!. Exiting.\n";
my @lines = <$in_fh>;
close($in_fh);

##
## capture the sequence info
##
# index 0 is the upper sequence, index 1 is the lower sequence
my ($names_ref, $seqs_ref, $resume_at) = read_sequences(\@lines);
my @sequence_name = @{$names_ref};
my @sequence      = @{$seqs_ref};

if (scalar(@sequence_name) < 2) {
    die "Error: expected two '# > name' sequence headers in '$inputfile', "
      . "found " . scalar(@sequence_name) . ". Exiting.\n";
}

# print (not printf): the data must never be interpreted as a format string
print("name: '$sequence_name[0]'\n");
print("seq: '$sequence[0]'\n");
print("name: '$sequence_name[1]'\n");
print("seq: '$sequence[1]'\n");

##
## capture the alignment info
##
my @alignment = read_alignments(\@lines, $resume_at);
if (!@alignment) {
    warn "Warning: no alignments found in '$inputfile'; nothing written.\n";
}

##
## the alignment phase: one .ali file per suboptimal alignment
##
for my $aln_no (0 .. $#alignment) {
    my ($upper_row, $lower_row, $used_upper, $used_lower) =
        expand_alignment($alignment[$aln_no], $sequence[0], $sequence[1]);

    if (   $used_upper != length($sequence[0])
        || $used_lower != length($sequence[1]))
    {
        warn "Warning: alignment $aln_no consumes $used_upper/$used_lower "
           . "residues but the sequences have "
           . length($sequence[0]) . "/" . length($sequence[1]) . ".\n";
    }

    # Zero-pad the alignment number to 6 digits so file names sort properly.
    my $tag      = sprintf('%06d', $aln_no);
    my $filename = "${outputfile}_$tag.ali";

    open(my $out_fh, '>', $filename)
        or die "could not open file '$filename': $!. Exiting.\n";
    print {$out_fh} pir_record($sequence_name[0], $structure_id eq 'u',
                               $tag, $first_res, $last_res, $upper_row);
    print {$out_fh} pir_record($sequence_name[1], $structure_id eq 'l',
                               $tag, $first_res, $last_res, $lower_row);
    # Buffered write errors only surface at close, so check it.
    close($out_fh)
        or die "could not finish writing file '$filename': $!. Exiting.\n";

    print("wrote $filename\n");    # tells us that the file has been written
}

exit(0);

##############################################
# helpers
##############################################

# Print the usage statement to standard output and exit with status 1.
sub usage_and_exit {
    print STDOUT $USAGE;
    exit(1);
}

# Scan the probA header for the sequences.
#   $lines_ref : reference to the array of input lines
# Returns (\@names, \@sequences, $resume_at) where $resume_at is the index
# of the line at which the scan stopped (the '#' line that closes the second
# sequence, or the number of lines if no such line was seen).
#
# A line matching '# > name' starts a sequence; the line after it is always
# taken as sequence text, and following lines are appended until a line
# containing '#' is met.
sub read_sequences {
    my ($lines_ref) = @_;

    my @names    = ();
    my @residues = ();
    my $seq_no   = -1;
    my $state    = 'idle';    # idle -> first (line after header) -> body

    my $idx;
    for ($idx = 0; $idx < scalar(@{$lines_ref}); $idx++) {
        my $line = $lines_ref->[$idx];

        if ($line =~ /# > (\w*)/) {
            $seq_no++;
            $names[$seq_no]    = $1;
            $residues[$seq_no] = '';
            $state             = 'first';
            next;
        }
        if ($state eq 'first') {
            $residues[$seq_no] .= $line;
            $state = 'body';
            next;
        }
        if ($state eq 'body') {
            if ($line =~ /#/) {
                $state = 'idle';
                last if $seq_no == 1;    # both sequences are complete
                next;
            }
            $residues[$seq_no] .= $line;
        }
    }

    for my $seq (@residues) {
        $seq =~ s/# //;         # drop the leading comment marker
        $seq =~ tr/\r\n//d;     # join lines; also tolerate CRLF captures
    }

    return (\@names, \@residues, $idx);
}

# Collect the encoded alignment strings that follow the sequences.
#   $lines_ref : reference to the array of input lines
#   $from      : index of the first line to examine
# Returns the list of alignment strings, in file order.
#
# A record starts at a line containing one of | : . ^ and stays open until a
# line containing a digit is seen. The alignment is the first
# whitespace-delimited token of the joined record.
sub read_alignments {
    my ($lines_ref, $from) = @_;

    my @records   = ();
    my $in_record = 0;
    for my $idx ($from .. $#{$lines_ref}) {
        my $line = $lines_ref->[$idx];

        if (!$in_record && $line =~ /[\|\:\.\^]/) {
            # Start empty: the line itself is appended exactly once below.
            push(@records, '');
            $in_record = 1;
        }
        if ($in_record && $line =~ /\d/) {
            $in_record = 0;
        }
        $records[-1] .= $line if @records;
    }

    my @found = ();
    for my $record (@records) {
        (my $joined = $record) =~ tr/\r\n//d;
        my @fields = split(' ', $joined);
        push(@found, $fields[0]);
    }
    return @found;
}

# Return the residue at position $pos of $seq, or '' when $pos is past the
# end (an alignment longer than its sequence then adds nothing).
sub residue_at {
    my ($seq, $pos) = @_;
    return $pos < length($seq) ? substr($seq, $pos, 1) : '';
}

# Translate one encoded alignment into two gapped rows.
#   $code  : alignment string
#   $upper : upper sequence, $lower : lower sequence
# Symbols:  | or :  take one residue from both sequences
#           .       residue from upper, gap ('-') in lower
#           ^       gap ('-') in upper, residue from lower
# Any other character is ignored.
# Returns ($upper_row, $lower_row, $used_upper, $used_lower); the last two
# are the numbers of residues consumed from each sequence.
sub expand_alignment {
    my ($code, $upper, $lower) = @_;

    my ($upper_row, $lower_row) = ('', '');
    my ($pos_upper, $pos_lower) = (0, 0);

    for my $symbol (split(//, $code)) {
        if ($symbol eq '|' || $symbol eq ':') {
            $upper_row .= residue_at($upper, $pos_upper++);
            $lower_row .= residue_at($lower, $pos_lower++);
        }
        elsif ($symbol eq '.') {
            $upper_row .= residue_at($upper, $pos_upper++);
            $lower_row .= '-';
        }
        elsif ($symbol eq '^') {
            $upper_row .= '-';
            $lower_row .= residue_at($lower, $pos_lower++);
        }
    }

    return ($upper_row, $lower_row, $pos_upper, $pos_lower);
}

# Build the three-line record for one sequence of an .ali file.
#   $name        : sequence name from the probA header
#   $is_template : true if this sequence is the structure (template)
#   $tag         : zero-padded alignment number, appended to non-template names
#   $first,$last : first/last residue of the template
#   $row         : gapped sequence row
# Both description lines carry 10 colon-separated fields.
# NOTE(review): the chain id is hard-coded to 'A' for the template.
sub pir_record {
    my ($name, $is_template, $tag, $first, $last, $row) = @_;

    my ($code, $description);
    if ($is_template) {
        $code        = $name;
        $description = join(':', 'structure', $name, $first, 'A',
                                 $last, 'A', ('') x 4);
    }
    else {
        $code        = $name . '_' . $tag;
        $description = join(':', 'sequence', $code, ('') x 8);
    }

    return ">P1; $code\n" . "$description\n" . "$row*\n";
}