#!/usr/bin/perl

=head1 NAME 

Design_EST_SSR_primers.pl  

=head1 Author

Mauricio La Rota,  Plant Breeding Dept, Cornell University, with a lot of support from the bioperl Packages.

=head1 SYNOPSIS

Design_EST_SSR_primers.pl  [-h[elp]]  [-host {server}] [-user {username}] [-p[ass] {PASSWD}] 

=head1 DESCRIPTION

 Fetch SSR coordinates and source sequence (the one used to detect the SSR) and control the Primer3 program
 to design primers around the SSR area.  Then write back the primers to the database
 Primers will be written to table EST_SSR.pcr_primer_set

 Uses bioperl to control Primer3 but not to connect to the DB.  This we implement directly.

 Warning:  The query has constraints for high quality stringent SSRs only,  see SQL query in script.

=head1 ARGUMENTS

-host
-user
-p[ass]
-dbname
-help

=cut


#use strict;
use DBI;
use Getopt::Long;
# use Bio::Tools::Primer3 ;  # This old version has been retired 10/24/2003 (Manual update from CVS)
use Bio::Tools::Run::Primer3;
use Bio::SeqIO;


my $host   = 'your.server.org' # place IP address of server here;
my $dbname = 'EST_SSR';
my $dbuser = 'guest';
my $dbpass = '';
#my $dbuser = 'perlscript';
#my $dbpass = 'perlscript';
my $sth = undef;
my $idx_spp = undef;
my $seqObj= undef;
my $help = undef;
my $PrimerSetRef = undef;

#the biosequence.database_id	# 3 hvgi, 4 osgi , 5 tagi
my %spphash=(barley => 3, 
			 rice   => 4,
			 wheat  => 5,
			);
# The name of the database tables for the gene indices at EST_SSR database:
my %gene_idx=(barley => "hvgi_est_ssr", 
			  rice   => "osgi_est_ssr",
			  wheat  => "tagi_est_ssr",
			);

# The columns to keep from the EST-SSR gene index SQL query
my %table_columns=qw(
	ESTSSRid			1
	seqid				1
	SSRType				1
	len					1
	start				1
	end					1
	biosequence_str		1
					);

&GetOptions( 'host=s'    => \$host,
	    'dbname=s'  => \$dbname,
	    'user=s' => \$dbuser,
	    'p|pass=s' => \$dbpass,
		'spp=s'  => \$idx_spp,
	    'h|help+' => \$help,
);
if(  defined $help  ) {
    system("perldoc $0");
    exit(0);
}
if( $idx_spp =~/\bbarley\b|\brice\b|\bwheat\b/ ) {
	print "\n\tProgram will extract coordinates and SSR sequences for $idx_spp gene index. \nPlease wait...\n";
	
}else {
	warn "\n!!! WARNING !!!:\n  '-spp' is required.  The argument for this tag is one of these: [wheat, rice, barley].\n\n\n";
	system("perldoc $0");
    exit(0);
}


# Process the species to work with (barley, wheat , or rice)
my $g_idx = $gene_idx{$idx_spp};
$idx_spp  = $spphash{$idx_spp};  #  CAUTION :  $idx_spp is now its number.
#print STDERR " converted spp to num $idx_spp\n";


#### need to connect to database and retrieve info 
#### for each SSR and source seq to design primers.

my $dsn = "DBI:mysql:database=$dbname;host=$host";
my $dbh = DBI->connect($dsn,$dbuser,$dbpass,{RaiseError=>1}) or die "Could not connect, error was:\n\t$DBI::errstr\n\n";

my $result_set_array_ref = &QueryDB;  # This Retrieves EST-SSR info from db


# Prepare the query statement to write to database, before getting into loop
my $insertSQL = qq[INSERT INTO EST_SSR.pcr_primer_set (gene_index_name,est_ssr_id,primer_set_rank,
	forw_primer_seq,forw_primer_start,forw_primer_length,
	forw_primer_tm,forw_primer_GC,rev_primer_seq,rev_primer_start,rev_primer_length,
	rev_primer_tm,rev_primer_GC,expected_product_size) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
];

my $sth_save_primers = $dbh->prepare($insertSQL);


# print "The contents of the table are: \n";

my $records =1; $timer_x=0;

# BIG LOOP
foreach my $ESTSSRrow (@$result_set_array_ref) { # for each record in table
#	last if ($records==500);  ##### For debugging only   #####
	#print STDERR (join("\t",@$ESTSSRrow)." \n");  # if object is an array of arrays
	#print "\n$ESTSSRrow->{seqid}\t$ESTSSRrow->{SSRType}\t$ESTSSRrow->{len}\n";  #if object is returned as array of hashes

	#### Create a Seq Object from this record
	$seqObj   = Bio::PrimarySeq->new ( -seq => $ESTSSRrow->{biosequence_str},
				   -id  => $ESTSSRrow->{ESTSSRid},  -accession_number => $ESTSSRrow->{seqid},
				   -alphabet => 'dna', -is_circular => 0
				   );

#	print("Sequence object contains: ID ".$seqObj->id()." Accession ".$seqObj->accession_number."\nSSR is ".$seqObj->subseq($ESTSSRrow->{start},$ESTSSRrow->{start}+$ESTSSRrow->{len}-1)." located at ".$ESTSSRrow->{start}.",".$ESTSSRrow->{end}."\n");



	#### Send to design primers
	$PrimerSetRef=&MakePrimerSet($seqObj,$ESTSSRrow->{start},$ESTSSRrow->{len});   #For now design only one pair.  Needs to be able to write to working dir a file "temp.out"
	
	# Set timer if you need a short delay between parsing results and writing to database
	#$timer_x=0; 
	#while ($timer_x < 300) {
	#	$timer_x++;
	#} 
	
	#### Save primers to database IF primer design succesful
	if (defined ($PrimerSetRef->{PRIMER_PRODUCT_SIZE})) {
		&SavePrimerSet($PrimerSetRef);
	}else{
		print STDERR "Primer3 -> Unable to design primers for seq ".$ESTSSRrow->{ESTSSRid}." from set $g_idx with accession ".$ESTSSRrow->{seqid}." \n";
	}
	$records++;

}  # END BIG LOOP



# Finish second statement (that was writing to DB)
$sth_save_primers->finish;



### DISCONNECT FROM DB
$dbh->disconnect or warn "Disconnection from DB failed at end of script: $DBI::errstr\n";

$records=$records-1;

print "\nThe number of records processed was $records.    bye\n";

# remove temp file
# system ('del .\*.out');


###################### END OF MAIN ################







########
sub QueryDB {

	# Design Query
	# Parameters to bind are: 
	#  biodatabase number :  3 hvgi, 4 osgi , 5 tagi
	# we want to deal with three separate species gene indices tables, => substitute with global $g_idx .  
	my $gidx_statement=qq[
	select a.id as ESTSSRid, a.seqid, a.SSRType, a.len, a.score, a.motif
	  , a.start-1, a.start, a.end, c.seq_length-a.end, c.biosequence_str
	from EST_SSR.$g_idx  a
	  inner join biosequence.bioentry b on a.seqid=b.accession
	  inner join biosequence.biosequence c on b.bioentry_id = c.bioentry_id
	where score >=90
	  AND  start >= 50 AND c.seq_length-a.end>=50
	  AND b.biodatabase_id = ?  # 3 hvgi, 4 osgi , 5 tagi
	  AND (
			(a.SSRType="dinucleotide"    AND len >=18)
		 OR (a.SSRType="trinucleotide"   AND len >=18)
		 OR (a.SSRType="tetranucleotide" AND len >=16)
		 OR (a.SSRType="pentanucleotide" AND len >=20)
	 )
	order by a.id, a.seqid , a.start
	];

	#print STDERR "Statement is:\n$gidx_statement\n";


	$sth = $dbh->prepare($gidx_statement);
	#$sth->bind_param( 1, $idx_spp); 

	# Execute query.
	eval{
	$sth->execute($idx_spp);  # $idx_spp is defined above as integer (3 , 4 or 5)
	};
		die "\n\tBIG Problem!,  can't execute query. error is:\n $@ \n"  if ($@);



	my $array_ref= $sth->fetchall_arrayref(\%table_columns); # only the selected colums, as array of hashes 
	# if left empty, gets all columns
	# my $array_ref= $sth->fetchall_arrayref()

	# OR select colums by number, returning ref to array of arrays ...
	#my $array_ref= $sth->fetchall_arrayref( [0,1,4,2,3,9] );

	# The returned database table has the following columns:
	# [0] seqid
	# [1] SSRType
	# [2] len
	# [3] score
	# [4] motif
	# [5] a.start-1
	# [6] start
	# [7] end
	# [8] c.seq_length-a.end
	# [9] biosequence_str

	my $table1size= @$array_ref;
	print STDERR "The number of rows in result set is $table1size\n\n";


	$sth->finish;
	return $array_ref;
}
####
###### end sub
####







########
sub MakePrimerSet {  # takes a sequence object ref and returns a ref to a PrimerPair object.

	# see script "run_primer3.pl"  for details and explanations

	# Get seq obj ref
	my $ThisSeqObj = shift;   # We do not check for content, we trust calling agent to turn in a seq ref
	my $SSRstart = shift;
	my $SSRlen = shift;
	my $SSRtarget = $SSRstart.",".$SSRlen;

	my $temp_file = "temp.".$ThisSeqObj->id().".out";

	# create primer3 obj
	my $primer3 = Bio::Tools::Run::Primer3->new(-seq=>$ThisSeqObj,
												  -outfile=>$temp_file
												);

	unless ($primer3->executable) {print STDERR "primer3 can not be found. Is it installed?\n"; exit(-1)}


	my $temp = $ThisSeqObj->length()-20;
	my $includeOnly = "10,".$temp;

	 # set the maximum and minimum Tm of the primer and all other settings
	  $primer3->add_targets('TARGET'=> $SSRtarget ,
							'INCLUDED_REGION'=> $includeOnly ,
							'PRIMER_MIN_TM'=>57 ,
							'PRIMER_OPT_TM'=>60 ,
							'PRIMER_MAX_TM'=>65 ,
							'PRIMER_MAX_DIFF_TM'=>10 ,
							'PRIMER_MIN_GC'=> 30 ,
							'PRIMER_MAX_GC'=> 70 ,
							'PRIMER_GC_CLAMP'=> 0 ,
							'PRIMER_NUM_RETURN'=>1 ,
							'PRIMER_FIRST_BASE_INDEX'=>1 ,
							'PRIMER_EXPLAIN_FLAG'=>1 ,
							'PRIMER_OPT_SIZE'=>20 ,
							'PRIMER_MIN_SIZE'=>18 ,
							'PRIMER_MAX_SIZE'=>25 ,
							'PRIMER_NUM_NS_ACCEPTED'=> 0 ,
							'PRIMER_MAX_POLY_X'=> 5 ,
							'PRIMER_LIBERAL_BASE'=> 0 ,
							'PRIMER_PRODUCT_SIZE_RANGE'=>'100-300',
							'PRIMER_PRODUCT_OPT_SIZE'=> 150
							);

	# design the primers. This runs primer3 and returns a 
	# Bio::Tools::Primer3 object with the results
	my $results=$primer3->run;

	# Read the results
	my $all_results=$results->all_results;


	return $all_results;
}

####
###### end sub
####





########
sub SavePrimerSet {

my $all_results = shift;  # Gets the ref to the primer_set object

#  Write the contents of the results
#print "These are the primer design results for this sequence: \n";
#print "$all_results->{PRIMER_SEQUENCE_ID}\n";
#print "$all_results->{PRIMER_LEFT}\t$all_results->{PRIMER_LEFT_SEQUENCE}\t$all_results->{PRIMER_LEFT_TM}\n";
#print "$all_results->{PRIMER_RIGHT}\t$all_results->{PRIMER_RIGHT_SEQUENCE}\t$all_results->{PRIMER_RIGHT_TM}\n";
#print "And PRIMER_PRODUCT_SIZE is $all_results->{PRIMER_PRODUCT_SIZE}\n";

my ($LeftPrimerStart,$LeftPrimerLength) = split(',' , $all_results->{PRIMER_LEFT});
my ($RightPrimerStart,$RightPrimerLength) = split(',' , $all_results->{PRIMER_RIGHT});
my  $PrimerSetRank =1;   # fixed for now to 1, and designing only one primer set

#print "\n Left Primer starts at base $LeftPrimerStart and is $LeftPrimerLength bases long \n";
#print  "Right Primer starts at base $RightPrimerStart and is $RightPrimerLength bases long \n\n";

#### The database table EST_SSR.pcr_primer_set contains the following fields
# 1 gene_index_name
# 2 est_ssr_id
# 3 primer_set_rank
# 4 forw_primer_seq   -- left primer
# 5 forw_primer_start
# 6 forw_primer_length
# 7 forw_primer_tm
# 8 forw_primer_GC
# 9 rev_primer_seq    -- right primer
# 10 rev_primer_start
# 11 rev_primer_length
# 12 rev_primer_tm
# 13 rev_primer_GC
# 14 expected_product_size
# 15 id  ->  do not use,  it is autoincrement

# The QUERY to write to database has been prepared in the main function, prior to the big loop.
#        1           2               3                     4                                 5                6                        7                                  8                                     9                                10                    11                      12                                13                                       14
#      $g_idx , $seqObj->id(), $PrimerSetRank, $all_results->{PRIMER_LEFT_SEQUENCE}, $LeftPrimerStart, $LeftPrimerLength , $all_results->{PRIMER_LEFT_TM}, $all_results->{PRIMER_LEFT_GC_PERCENT}, $all_results->{PRIMER_RIGHT_SEQUENCE} , $RightPrimerStart, $RightPrimerLength , $all_results->{PRIMER_RIGHT_TM} , $all_results->{PRIMER_RIGHT_GC_PERCENT} ,$all_results->{PRIMER_PRODUCT_SIZE} 
####

eval{
	$sth_save_primers->execute( $g_idx , $seqObj->id(), $PrimerSetRank, $all_results->{PRIMER_LEFT_SEQUENCE}, $LeftPrimerStart, $LeftPrimerLength , $all_results->{PRIMER_LEFT_TM}, $all_results->{PRIMER_LEFT_GC_PERCENT}, $all_results->{PRIMER_RIGHT_SEQUENCE} , $RightPrimerStart, $RightPrimerLength , $all_results->{PRIMER_RIGHT_TM} , $all_results->{PRIMER_RIGHT_GC_PERCENT} ,$all_results->{PRIMER_PRODUCT_SIZE} );
}; 
print STDERR "BIG Problem!,  can't insert data for record $records into table. \n\n"  if ($@);


}

####
###### end sub
####
