#!/usr/bin/perl
#
# Combine the data in "genotypes.txt", "markers.txt" and "families.txt" 
# and convert into a CRI-MAP .gen file.

# file names
$gfile = "genotypes.txt";  # genotype data
$mfile = "markers.txt";    # list of markers, in order
$ffile = "families.txt";   # family information 
$ofile = "data.gen";       # output file

# read marker names and place in the vector @markers
open(IN, $mfile) or die("Cannot read from $mfile");
while($line = <IN>) {
    chomp($line);
    push(@markers, $line);
}
close(IN);

# read family information and place in the hashes %dad, %mom, %sex
open(IN, $ffile) or die("Cannot read from $ffile");
$line = <IN>;
while($line = <IN>) {
    chomp($line);
    @v = split(/\s+/, $line);
    if($v[0] eq "") { shift @v; }
    
    ($fam, $ind, $dad, $mom, $sex) = @v;
    
    $dad{$fam}{$ind} = $dad;
    $mom{$fam}{$ind} = $mom;

    if($sex == 2) {   # make female = 0 rather than = 2 
	$sex{$fam}{$ind} = 0;
    }
    else {  
	$sex{$fam}{$ind} = $sex;
    }
}
close(IN);

# read genotype data
open(IN, $gfile) or die("Cannot read from $gfile");

# header line : parse family/individual IDs
$line = <IN>; chomp($line);
@v = split(/\s+/, $line);
if($v[0] eq "") { shift @v; } # if first entry is blank, get rid of it
shift @v; # get rid of the first entry "Marker"
foreach $v (@v) {
    ($fam,$ind) = split(/-/, $v); # split at '-'
    @fam = (@fam, $fam);
    @ind = (@ind, $ind);
}

# parse rest of file 
while($line = <IN>) {

    $marker = substr($line, 0, 9);
    $marker =~ s/\s+//g; # remove any extra spaces
    
    foreach $i (0..(@fam-1)) {
	$g = substr($line, 9+$i*7, 7); 
	($g1, $g2) = split(/\//, $g);  # split at '/'
	$g1 =~ s/\s+//g; # remove any extra spaces
	$g2 =~ s/\s+//g;

	if($g1 == 0 or $g2 == 0) { # replace blanks with 0's
	    $g1 = 0;
	    $g2 = 0;
	}

	$gen{$fam[$i]}{$ind[$i]}{$marker} = $g1 . " " . $g2;
	
    }
}
close(IN);


# write genotype data as cri-map .gen file
open(OUT, ">$ofile") or die("Cannot write to $ofile");

$nfam = keys %gen;      # number of families
print OUT ("$nfam\n");
$nmar = @markers;       # number of markers
print OUT ("$nmar\n");
foreach $mar (@markers) {
    print OUT ("$mar\n");
}

foreach $fam (sort numerically keys %gen) {  # families in numerical order
    print OUT ("$fam\n");
    $nind = keys %{$gen{$fam}};  # number of individuals in family
    print OUT ("$nind\n");

    foreach $ind (sort numerically keys %{$gen{$fam}}) {
	print OUT ("$ind $mom{$fam}{$ind} $dad{$fam}{$ind} $sex{$fam}{$ind}\n");
	
	foreach $mar (@markers) {  # markers in order
	    print OUT ("$gen{$fam}{$ind}{$mar} ");
	}

	print OUT ("\n");
    }
}
close(OUT);


# subroutine to allow sorts by numerical order
sub numerically { $a <=> $b; }