#! /usr/bin/perl
# $Id: gmap_build.pl.in 53700 2011-12-05 19:43:10Z twu $

use warnings;	

$gmapdb = "/var/cache/gmap";
$package_version = "2011-11-30";

use Getopt::Std;
use File::Copy;	

undef($opt_B);	     # binary directory
undef($opt_D);			# destination directory
undef($opt_b);			# offsetscomp basesize
undef($opt_d);			# genome name
undef($opt_k);			# k-mer size for genomic index (allowed: 12-15)
undef($opt_S);			# Old flag for turning off sorting
undef($opt_s);			# Sorting
undef($opt_g);			# gunzip files
$opt_w = 2; # waits (sleeps) this many seconds between steps.  Useful if there is a delay in the filesystem.
getopts("B:D:b:d:k:s:gw:");

if (!defined($bindir = $opt_B)) {
    $bindir = "/usr/bin";
}


if (!defined($basesize = $opt_b)) {
    $basesize = 12;
}

if (!defined($kmersize = $opt_k)) {
    print STDERR "-k flag not specified, so building with default 15-mers\n";
    $kmersize = 15;
}

if (!defined($dbname = $opt_d)) {
  print_usage();
  die "Must specify genome database name with -d flag.";
} elsif ($opt_d =~ /(\S+)\/(\S+)/) {
  $dbname = $2;
  if (defined($opt_D)) {
    $opt_D = $opt_D . "/" . $1;
  } else {
    $opt_D = $1;
  }
}

$dbname =~ s/\/$//;	# In case user gives -d argument with trailing slash

if (!defined($dblocation = $opt_D)) {
    $dblocation = $gmapdb;
}

if (defined($opt_S)) {
    print STDERR "The flag -S is now deprecated.  Using '-s none' instead.\n";
    $chr_order_flag = "-s none";
} elsif (defined($opt_s)) {
    $chr_order_flag = "-s $opt_s";
} else {
    # Default is to order genomes
    print STDERR "Sorting chromosomes in chrom order.  To turn off or sort other ways, use the -s flag.\n";
    $chr_order_flag = "";
}

if (defined($opt_g)) {
    $gunzip_flag = "-g";
} else {
    $gunzip_flag = "";
}


$genome_fasta = join(" ",@ARGV);



#####################################################################################

create_coords();
create_genome_version();
make_contig();
compress_genome();
create_index_offsets();
create_index_positions();
install_db();

exit;


#####################################################################################


sub create_coords {
    $cmd = "$bindir/fa_coords $gunzip_flag -o $dbname.coords $genome_fasta";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub check_coords_exists {
# file exists, and is not empty
    unless (-s "$dbname.coords") {
	die "ERROR: $dbname.coords not found.\n";
    }
    return;
}

sub create_genome_version {
    open GENOMEVERSIONFILE, ">$dbname.version" or die $!;
    print GENOMEVERSIONFILE "$dbname\n";
    close GENOMEVERSIONFILE or die $!;
    sleep($opt_w);
    return;
}

sub make_contig {
    check_coords_exists();
    $cmd = "$bindir/gmap_process $gunzip_flag -c $dbname.coords $genome_fasta | $bindir/gmapindex -k $kmersize -d $dbname -A $chr_order_flag";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub compress_genome {
    $cmd = "$bindir/gmap_process $gunzip_flag -c $dbname.coords $genome_fasta | $bindir/gmapindex -k $kmersize -d $dbname -G";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub full_ASCII_genome {
    check_coords_exists();
    make_contig();
	
    $cmd = "$bindir/gmap_process $gunzip_flag -c $dbname.coords $genome_fasta | $bindir/gmapindex -k $kmersize -d $dbname -l -G";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub create_index_offsets {
    $cmd = "cat $dbname.genomecomp | $bindir/gmapindex -k $kmersize -d $dbname -q 3 -O";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub create_index_positions {
    "cat $dbname.genomecomp | $bindir/gmapindex -k $kmersize -d $dbname -q 3 -P";
    print STDERR "Running $cmd\n";
    if (($rc = system($cmd)) != 0) {
	die "$cmd failed with return code $rc";
    }
    sleep($opt_w);
    return;
}

sub install_db {
    my @suffixes = (
	"chromosome", 
	"chromosome.iit", 
	"chrsubset", 
	"contig", 
	"contig.iit", 
	"genomecomp", 
	"version");
	
    push @suffixes,sprintf "ref%d%d3gammaptrs",$basesize,$kmersize;
    push @suffixes,sprintf "ref%d%d3offsetscomp",$basesize,$kmersize;
    push @suffixes,sprintf "ref%d%d3positions",$basesize,$kmersize;

    $dblocation = "$dblocation/$dbname";
	
    print STDERR "Copying files to directory $dblocation\n";
    system("mkdir -p $dblocation");
    system("mkdir -p $dblocation/$dbname.maps");
    system("chmod 755 $dblocation/$dbname.maps");
    foreach $suffix (@suffixes) {
	system("mv $dbname.$suffix $dblocation/$dbname.$suffix");
	system("chmod 644 $dblocation/$dbname.$suffix");
    }
    return;
}



sub print_usage {
  print <<TEXT1;

gmap_build: Builds a gmap database for a genome to be used by GMAP or GSNAP.
Part of GMAP package, version $package_version.

A simplified alternative to using the program gmap_setup, which creates a Makefile.

Usage: gmap_build [options...] -d <genomename> <fasta_files>

Options:
    -d STRING   genome name
    -D STRING   destination directory for installation (defaults to gmapdb directory specified at configure time)
    -b INT      basesize for offsetscomp (default 12)
    -k INT      k-mer value for genomic index (allowed: 12..15, default 15)
    -s STRING   Sort chromosomes using given method:
                  none - use chromosomes as found in FASTA file(s)
                  alpha - sort chromosomes alphabetically (chr10 before chr 1)
                  numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY
                  chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU
    -g          files are gzipped, so need to gunzip each file first
    -w INT      wait (sleep) this many seconds after each step

TEXT1
  return;
}

