online Appendix 4
Perl script to generate LR, noLR, LRall and noLRall character sets

Regier, J.C., Shultz, J.W., Ganley, A.R.D., Hussey, A., Shi, D., Ball, B., Zwick, A., Stajich, J.E., Cummings, M.P., Martin, J.W., and Cunningham, C.W. 2008. Resolving Arthropod Phylogeny: Exploring Phylogenetic Signal within 41kb of Protein-coding Nuclear Gene Sequence. Syst. Biol. 57:920-938.
-----------------------------------------------------------------
#!/usr/bin/perl
# ./LeuArg1.pl
# Input file must be in flat format.
# Code written by April Hussey & Paul Donohue
# on 02-03-06.
# First published in Systematic Biology, 2008, Volume 57, pp. 920-938 --
# Title: "Resolving Arthropod Phylogeny: Exploring Phylogenetic Signal
# within 41 kb of Protein-Coding Nuclear Gene Sequence"
# Authors: Regier, Shultz, Ganley, Hussey, Shi, Ball, Zwick, Stajich,
# Cummings, Martin and Cunningham
# Output directory contains files of LRx, LRallx, noLRx and noLRallx
# character sets, where x = the minimum number of Leu or Arg residues
# encoded by nt1 (= first codon position) at all individual nt1
# characters in the LRx or LRallx character set.
# LR1 = all nt1 characters that encode 1 or more Leu or Arg residues.
# LR2 = all nt1 characters that encode 2 or more Leu or Arg residues...
# The maximum possible number of Leu or Arg residues at any given character
# = the total number of sequences in the data matrix.
# LRallx includes, in addition to LRx, polymorphic characters that encode
# Leu or Arg.
# LRall2 = all nt1 characters that encode 2 or more Leu or Arg residues,
# including at polymorphic sites.
# nt1 = LRx + noLRx = LRallx + noLRallx
###########################################################################
# Usage: LeuArg1.pl <input file>
#
# $ARGV[0] holds the name of the input file.  The file is split at '#'
# characters; each fragment is one record whose first line is the sequence
# name and whose remaining lines are the sequence itself.
#
# Output is written to the directory "<input file>_Data":
#   LRx, noLRx        - nt1 positions (1-based) where at least / fewer than
#                       x sequences carry a codon from the "unambiguous" list
#   LRallx, noLRallx  - the same, using the wider "all" codon list
#   CodonList         - per-codon table of the matching codons
###########################################################################

use strict;
use warnings;

# Codon patterns, kept exactly as published.  Letters other than A/C/G/T
# are nucleotide ambiguity codes; '?' and '-' are also accepted in the
# third position.  A trailing '-' inside a character class is a literal.
#
# "Unambiguous" list: used for the LRx / noLRx sets.
my $LEU_UNAMBIGUOUS = qr/CT[ABCDGHKMNRSTVWY?-]|TT[AGR?-]|YT[AGR?-]/i;
my $ARG_UNAMBIGUOUS = qr/CG[ABCDGHKMNRSTVWY?-]|AG[AGR?-]|MG[AGR?-]/i;

# "All" list: used for the LRallx / noLRallx sets.
my $LEU_ALL = qr/CT[ABCDGHKMNRSTVWY?-]|TT[ABDGHKMNRSVW?-]|YT[ABCDGHKMNRSTVWY?-]/i;
my $ARG_ALL = qr/CG[ABCDGHKMNRSTVWY?-]|AG[ABDGHKMNRSVW?-]|MG[ABCDGHKMNRSTVWY?-]/i;

# Codons reported in the "Ambiguous" column of CodonList.
my $LEU_AMBIGUOUS = qr/TT[BDHKMNSVW?-]|YT[BCDHKMNSTVWY?-]/i;
my $ARG_AMBIGUOUS = qr/AG[BDHKMNSVW?-]|MG[BCDHKMNSTVWY?-]/i;

# Maximum length of one line in the position-list output files.
my $MAX_LINE_LENGTH = 60;

# Read the flat-format input file and return the list of bare sequences
# (names and all whitespace removed).  Dies if the file cannot be opened.
sub read_sequences {
    my ($path) = @_;

    open(my $in, '<', $path)
        or die "Cannot open input file '$path': $!\n";
    my $contents = do { local $/; <$in> };
    close($in);
    $contents = '' unless defined $contents;

    # Split at the #s; the first fragment is whatever precedes the first #.
    my @sequences = split(/#/, $contents);
    shift @sequences;

    for my $sequence (@sequences) {
        $sequence =~ s/^.*\n//;    # drop the sequence-name line
        # Remove every whitespace character, not just spaces and newlines:
        # a stray "\r" from a CRLF file would otherwise shift the codon
        # reading frame.
        $sequence =~ s/\s+//g;
    }
    return @sequences;
}

# Walk every sequence codon by codon and tally matches per codon start
# position (0-based index of nt1).  Returns a hash reference with:
#   unambiguous_count  - number of sequences matching the unambiguous list
#   unambiguous_codons - those codons, each followed by a space
#   all_count          - number of sequences matching the "all" list
#   ambiguous_codons   - codons matching the ambiguous list, each preceded
#                        by a space
# Positions that never matched are left undefined, not 0.
sub tally_codons {
    my ($sequences) = @_;

    my %tally = (
        unambiguous_count  => [],
        unambiguous_codons => [],
        all_count          => [],
        ambiguous_codons   => [],
    );

    for my $sequence (@$sequences) {
        my $sequence_length = length $sequence;
        for (my $pos = 0; $pos < $sequence_length; $pos += 3) {
            # A trailing partial codon is shorter than 3 characters and
            # therefore cannot match any of the 3-character patterns.
            my $codon = substr($sequence, $pos, 3);

            if ($codon =~ $LEU_UNAMBIGUOUS or $codon =~ $ARG_UNAMBIGUOUS) {
                $tally{unambiguous_count}[$pos]++;
                $tally{unambiguous_codons}[$pos] .= $codon . " ";
            }

            if ($codon =~ $LEU_ALL or $codon =~ $ARG_ALL) {
                $tally{all_count}[$pos]++;
                if ($codon =~ $LEU_AMBIGUOUS or $codon =~ $ARG_AMBIGUOUS) {
                    $tally{ambiguous_codons}[$pos] .= " " . $codon;
                }
            }
        }
    }
    return \%tally;
}

# Split the codon start positions below $length into those whose count is
# at least $min and those whose count is lower.  Returns two array
# references holding 1-based nt1 positions.
sub partition_positions {
    my ($counts, $length, $min) = @_;

    my (@at_least, @fewer);
    for (my $pos = 0; $pos < $length; $pos += 3) {
        my $count = $counts->[$pos] || 0;    # never incremented => 0
        if ($count >= $min) {
            push @at_least, $pos + 1;
        }
        else {
            push @fewer, $pos + 1;
        }
    }
    return (\@at_least, \@fewer);
}

# Join positions with single spaces, starting a new line whenever the
# current line (including its trailing space) plus the next number would
# exceed $MAX_LINE_LENGTH.  The result has no trailing newline; an empty
# list yields the empty string.
sub wrap_positions {
    my @positions = @_;

    my @lines;
    my $line = "";
    for my $position (@positions) {
        if (length($line) + length($position) > $MAX_LINE_LENGTH) {
            chop($line);    # remove the trailing space
            push @lines, $line;
            $line = $position . " ";
        }
        else {
            $line .= $position . " ";
        }
    }
    chop($line);
    push @lines, $line;
    return join("\n", @lines);
}

# Build the CodonList table: one header line, then one row for every codon
# position at which at least one sequence matched the "all" list.
sub format_codon_list {
    my ($tally, $length, $sequence_count) = @_;

    # Four columns per sequence: three codon letters plus a separator.
    my $width = $sequence_count * 4;

    my $text = sprintf("%-11s %" . $width . "s|%s\n",
        "Position", "Unambiguous ", " Ambiguous");

    for (my $pos = 0; $pos < $length; $pos += 3) {
        next unless $tally->{all_count}[$pos];

        my $unambiguous = $tally->{unambiguous_codons}[$pos];
        my $ambiguous   = $tally->{ambiguous_codons}[$pos];
        $unambiguous = "" unless defined $unambiguous;
        $ambiguous   = "" unless defined $ambiguous;

        $text .= sprintf("%5s-%-5s %" . $width . "s|%s\n",
            $pos + 1, $pos + 3, $unambiguous, $ambiguous);
    }
    return $text;
}

# Write $text to $path, replacing any existing file.  Dies on failure.
sub write_text_file {
    my ($path, $text) = @_;

    open(my $out, '>', $path)
        or die "Cannot write '$path': $!\n";
    print {$out} $text;
    close($out)
        or die "Cannot close '$path': $!\n";
    return;
}

# Entry point: read the sequences from $input_path and write all character
# set files plus CodonList into the directory "${input_path}_Data".
sub main {
    my ($input_path) = @_;

    my @sequences = read_sequences($input_path);
    die "No sequences (records starting with '#') found in '$input_path'\n"
        unless @sequences;

    # Create the output directory; reuse it if it already exists.
    my $output_dir = $input_path . "_Data";
    unless (-d $output_dir) {
        mkdir($output_dir, 0755)
            or die "Cannot create directory '$output_dir': $!\n";
    }

    # All sequences should have the same length, so take it from the first
    # one; this also works when the file holds a single sequence.
    my $consensus_length = length $sequences[0];
    for my $index (1 .. $#sequences) {
        my $this_length = length $sequences[$index];
        if ($this_length != $consensus_length) {
            warn "Warning: sequence " . ($index + 1)
                . " has length $this_length, expected $consensus_length\n";
        }
    }

    my $tally = tally_codons(\@sequences);

    # One set of files for every possible minimum number of matches.
    for my $matches (1 .. scalar @sequences) {
        my ($lr, $no_lr) = partition_positions(
            $tally->{unambiguous_count}, $consensus_length, $matches);
        write_text_file("$output_dir/LR$matches",   wrap_positions(@$lr));
        write_text_file("$output_dir/noLR$matches", wrap_positions(@$no_lr));

        my ($lr_all, $no_lr_all) = partition_positions(
            $tally->{all_count}, $consensus_length, $matches);
        write_text_file("$output_dir/LRall$matches",
            wrap_positions(@$lr_all));
        write_text_file("$output_dir/noLRall$matches",
            wrap_positions(@$no_lr_all));
    }

    write_text_file("$output_dir/CodonList",
        format_codon_list($tally, $consensus_length, scalar @sequences));
    return;
}

# Run only when executed as a script, not when loaded by another file.
unless (caller) {
    if (@ARGV) {
        main($ARGV[0]);
    }
    else {
        warn "Usage: $0 <flat-format sequence file>\n";
    }
}