#! /usr/bin/perl

# This script reads a DSSP file and writes a new file in a simple format:
# protein name, AR sequence, SS sequence, SEA sequence, PDB numbering
#
# Alexey Porollo
# Department of Environmental Health
# University of Cincinnati College of Medicine
# and
# Division of Biomedical Informatics
# Children's Hospital Research Foundation
#
# Cincinnati, OH
# USA

# constants for the parsing of DSSP output
use constant FAKE_NUM => -100000;
use constant PDBNUM_POS => 5;
use constant PDBNUM_LEN => 5;
use constant PDBNUM_INSCODE_POS => 10;
use constant PDBNUM_INSCODE_LEN => 1;
use constant CHAIN_POS => 11;
use constant CHAIN_LEN => 1;
use constant AA_POS => 13;
use constant AA_LEN => 1;
use constant CHAIN_BREAK_POS => 14;
use constant CHAIN_BREAK_LEN => 1;
use constant SS_POS => 16;
use constant SS_LEN => 1;
use constant RSA_POS => 35;
use constant RSA_LEN => 3;
use constant PHI_POS => 103;
use constant PHI_LEN => 6;
use constant PSI_POS => 109;
use constant PSI_LEN => 6;
use constant X_POS => 115;
use constant X_LEN => 7;
use constant Y_POS => 122;
use constant Y_LEN => 7;
use constant Z_POS => 129;
use constant Z_LEN => 7;

%basic_surface = (A=>115, R=>225, D=>150, N=>160, C=>135,
                  E=>190, Q=>180, G=>75,  H=>195, I=>175,
                  L=>170, K=>200, M=>185, F=>210, P=>145,
                  S=>115, T=>140, W=>255, Y=>230, V=>155, X=>1);

%basic_surface_brnn = (A=>106, R=>248, D=>163, N=>157, C=>135,
                       E=>194, Q=>198, G=>84,  H=>184, I=>169,
                       L=>164, K=>205, M=>188, F=>197, P=>136,
                       S=>130, T=>142, W=>227, Y=>222, V=>142, X=>1);


$type = shift;
$type = "\L$type";
$file_mask = shift;
$chains = shift;

if ($type !~ /norsa|sea|surf|pdb/i || !file_mask || $chains !~ /all|first/i) {
    print "USAGE: perl dssp2seq.pl type file_name_mask all|first\n";
    print "TYPE of output:\n";
    print "norsa - no any RSA data\n";
    print "sea   - real valued RSA\n";
    print "surf  - 1-digit integer RSA rounded in SABLE way\n";
    print "pdb   - real valued RSA + PDB numbering\n";
    print "EXAMPLE: perl dssp2seq.pl pdb '*.dssp' all\n";
    exit 1;
}

open(SEQ, ">pdb2dssp.$type") || die "Can't create output file pdb2dssp.$type: $!\n";

my %data = ();
my $protein_name = '';
# look for all files under given file mask
@filenames = glob($file_mask);
foreach $filename (@filenames) {
    next unless (-f $filename);
    print "Processing file $filename... ";

    my %dssp = ParseDSSP($filename, -1, 1);

    if ($filename =~ /^pdb/) {
	# for pdb files
	$protein_name = substr($filename, 3, index($filename, '.') - 3);
    }
    else {
	# for any other file names
	$protein_name = substr($filename, 0, rindex($filename, '.'));
    }

    # print data
    foreach my $ch (sort keys %dssp) {
	next if $chains =~ /first/i and $ch ne $dssp{FirstChain};
	next if $chains =~ /all/i and $ch eq 'FirstChain';
	next if length($dssp{$ch}{AR}) == 0;

	print SEQ ($chains =~ /first/i) ? ">$protein_name\n" : ">$protein_name\_$ch\n";
	print SEQ "\U$dssp{$ch}{AR}\n$dssp{$ch}{SS}\n";

	if ($type eq 'surf') {
	    for (my $i=0; $i<length($dssp{$ch}{AR}); $i++) {
		my $surf = substr($dssp{$ch}{SURF}, $i, 1);
		if ($dssp_log && $dssp_info{$protein_name}{$ch}{$dssp{$ch}{NUM}[$i]}) {
		    $surf = '-';
		}
		print SEQ $surf;
	    }
	    print SEQ "\n";
	}
	if ($type =~ /sea|pdb/) {
	    for my $i (0..$#{$dssp{$ch}{SEA}}) {
		my $surf_ratio = $dssp{$ch}{SEA}[$i] / 
		                 $basic_surface{substr($dssp{$ch}{AR}, $i, 1)};
#                                $basic_surface_brnn{substr($dssp{$ch}{AR}, $i, 1)};
		$surf_ratio = 1 if ($surf_ratio > 1);
		$surf_ratio = -1 if (substr($dssp{$ch}{AR}, $i, 1) eq X);

		if ($dssp_log && $dssp_info{$protein_name}{$ch}{$dssp{$ch}{NUM}[$i]}) {
		    $surf_ratio = -$surf_ratio;
		}

		printf SEQ " %2.2f", $surf_ratio;
	    }
	    print SEQ "\n";
	}
	if ($type eq 'pdb') {
	    print SEQ "@{$dssp{$ch}{NUM}}\n";
	}
    }
    # close current file
    print "Done\n";
}

close SEQ;

print "See file pdb2dssp.$type for results.\n";


# ParseDSSP reads the given file and
# gets AR, SS and SEA data
# INPUT: file name,
#        start of numeration (-1 - original PDB),
#        way of rounding,
#        keep SS as is
# RETURN: hash Chain - AR/SS/SEA/SURF/NUM
#
# ways of rounding SA to 0..9:
# 0 - mathematical, 0->[0, 0.05) 1->[0.05, 0.15) ... 9->[0.85, 1]
# 1 - SABLE, 0->[0, 0.1) 1->[0.1, 0.2) ... 9->[0.9, 1]
# NEW: treats correctly insertion codes and gaps in numeration
sub ParseDSSP {
    my $fname = shift;
    my $start = shift;
    my $way = shift;
    my $keep = shift;
    my %data;
    my %ss_bridge;
    my %num_shift;

    # open current file to be converted
    if (!open(DSSP, $fname)) {print " $! "; return %data;}
    my @lines = <DSSP>;
    # close current file
    close DSSP;

    # find the beginning of lines with residue data
    do {
	$_ = shift @lines;
    } until /RESIDUE AA STRUCTURE BP1 BP2  ACC/ || scalar(@lines) == 0;
    return %data if scalar(@lines) == 0;

    my ($first_chain, $chain_id, $gap) = '';
    # enumerate data lines
    for my $i (0 .. $#lines) {
	my $l = $lines[$i];
	# get data
	my ($num, $ins_code, $aa, $ss, $sea, $rsa, $ca_x, $ca_y, $ca_z, $phi, $psi) = '';
	# amino acid
	my $aa = substr($l, AA_POS, AA_LEN);
	# is it a sequence break?
	if ($aa eq '!') {
	    # chain break?
	    next if substr($l, CHAIN_BREAK_POS, CHAIN_BREAK_LEN) eq '*';
	    # raise a gap flag
	    $gap = 1;
	    next;
	}
	# chain ID
	$chain_id = substr($l, CHAIN_POS, CHAIN_LEN);
	$first_chain = $chain_id unless $first_chain;
	# PDB number and insertion code if any
	$num = substr($l, PDBNUM_POS, PDBNUM_LEN);
	$num =~ s/ //g;
	$ins_code = substr($l, PDBNUM_INSCODE_POS, PDBNUM_INSCODE_LEN);
	$ins_code = '' if $ins_code eq ' ';
	# process weird cases in numeration
	my $num_last = ($#{$data{$chain_id}{NUM}} >= 0) ? $data{$chain_id}{NUM}[$#{$data{$chain_id}{NUM}}] : FAKE_NUM;
	my $num_next = (defined $lines[$i + 1] && 
			substr($lines[$i + 1], CHAIN_POS, CHAIN_LEN) eq $chain_id &&
			substr($lines[$i + 1], PDBNUM_POS, PDBNUM_LEN) =~ /\d+/) ? $& : FAKE_NUM;
	if ($num_last > FAKE_NUM && $num_next > FAKE_NUM) {
	    # get rid of mistypings, e.g. 1azz_D: 46,27,48
	    if (($num < $num_last && $num < $num_next) ||
		($num > $num_last && $num > $num_next)) {
		$num = $num_last + 1 if $num_next - $num_last == 2;
	    }
	}
	# get rid of weird insertions, e.g. 1i6v_D: 155,?????,452
	next if ($num - $num_last != 1 && $ins_code eq 'U');
	# fill gap if any
	if ($gap && $#{$data{$chain_id}{NUM}} >= 0) {
	    for my $n ($data{$chain_id}{NUM}[$#{$data{$chain_id}{NUM}}] + 1 .. $num - 1) {
		$data{$chain_id}{AR} .= 'X';
		$data{$chain_id}{SS} .= 'X';
		$data{$chain_id}{SURF} .= 9;
		push @{$data{$chain_id}{SEA}}, -1;
		push @{$data{$chain_id}{NUM}}, $n;
		push @{$data{$chain_id}{CAX}}, 999.900;
		push @{$data{$chain_id}{CAY}}, 999.900;
		push @{$data{$chain_id}{CAZ}}, 999.900;
		push @{$data{$chain_id}{PHI}}, 0;
		push @{$data{$chain_id}{PSI}}, 0;
	    }
	}
	# drop a gap flag
	$gap = 0;
	# remember shift for first residue in chain
	if ($start >= 0 && !defined $num_shift{$chain_id}) {
	    $num_shift{$chain_id} = $start - $num;
	}

	# ===> cases under the question
	# ---> single insertions
	# 1ggp_A:238,603,239
	# 1ggp_B:159,604,160...162,605,163
	# 1k9o_E:184,1184,185...188,1188,189...221,1221,222
	# 1axs_H,B:52,521,53
	# ---> multiple insertions
	# 1ggp_A:209,601,602,210
	# 1axs_H,B:82,821,822,823,83...100,1001,1002,1003,101
	# ---> 'super'-numbering
	# 1jr3_B:9,2010,..,2069,70; 1jr3_C:9,5010,..,5017,18
	# ---> block insertions
	# 1ngv_B:384!800,..,806!900,..,907!426
	# 1ngv_D:382!800,..,808!900,..,904!426
	# ---> splitted chain definition
	# 1orw chains A,B,C,D

	# is it S-S bridge?
	if ($aa =~ /[a-z]/) {
	    my $rn = (defined $ss_bridge{$chain_id}{$aa}{R1}) ? 'R2' : 'R1';
	    $ss_bridge{$chain_id}{$aa}{$rn} = "$num$ins_code";
	    $aa =~ s/[a-z]/C/; # lower case for SS-bridge CYS (DSSP description)
	}
	# modified residue?
	$aa = 'X' unless ($aa =~ /[ARDNCEQGHILKMFPSTWYV]/);

	# secondary structure
	my $ss = substr($l, SS_POS, SS_LEN);
	$ss = 'C' unless $ss =~ /[A-Z]/;
	$ss = SStoHECstate($ss) unless $keep;

	# solvent accessibility
	my $sea = substr($l, RSA_POS, RSA_LEN);
	$sea =~ s/ //g;
	# count RSA
	my $surf_ratio = $sea / $basic_surface{$aa};
	if ($aa eq 'X') {
	    $rsa = 9;
	}
	else {
	    # mathematical
	    if ($way == 0) {
		$surf_ratio = 0.9 if ($surf_ratio > 0.94); # to avoid rounding to 10 and greater
		$rsa = sprintf("%.0f", $surf_ratio * 10);
	    }
	    # SABLE
	    else { # if ($way == 1)
		my $tmp = int $surf_ratio * 10;
		$tmp = 9 if $tmp > 9;
		$rsa = $tmp;
	    }
	}

	# get coordinates
	$ca_x = substr($l, X_POS, X_LEN);
	$ca_x =~ s/ //g;
	$ca_y = substr($l, Y_POS, Y_LEN);
	$ca_y =~ s/ //g;
	$ca_z = substr($l, Z_POS, Z_LEN);
	$ca_z =~ s/ //g;
	# get torsion angles
	$phi = substr($l, PHI_POS, PHI_LEN);
	$phi =~ s/ //g;
	$psi = substr($l, PSI_POS, PSI_LEN);
	$psi =~ s/ //g;

	$data{$chain_id}{AR} .= $aa;
	$data{$chain_id}{SS} .= $ss;
	$data{$chain_id}{SURF} .= $rsa;
	push @{$data{$chain_id}{SEA}}, $sea;
	push @{$data{$chain_id}{NUM}}, "$num$ins_code";
	push @{$data{$chain_id}{CAX}}, $ca_x;
	push @{$data{$chain_id}{CAY}}, $ca_y;
	push @{$data{$chain_id}{CAZ}}, $ca_z;
	push @{$data{$chain_id}{PHI}}, $phi;
	push @{$data{$chain_id}{PSI}}, $psi;
    } # for my $i (0 .. $#lines)

    # update info
    for my $ch (keys %data) {
	# add info about S-S bridges
	%{$data{$ch}{SSBridge}} = %{$ss_bridge{$ch}};
	# shift numbers to given starting number
	if ($start >= 0) {
	    for my $n (@{$data{$ch}{NUM}}) {
		$n =~ /\d+/; # take care of insertion code if any
		$n = $& + $num_shift{$ch};
		$n .= $';
	    }
	    for my $br (keys %{$data{$ch}{SSBridge}}) {
		my $n = $data{$ch}{SSBridge}{$br}{R1};
		$n =~ /\d+/; # take care of insertion code if any
		$n = $& + $num_shift{$ch};
		$n .= $';
		$data{$ch}{SSBridge}{$br}{R1} = $n;
		$n = $data{$ch}{SSBridge}{$br}{R2};
		$n =~ /\d+/; # take care of insertion code if any
		$n = $& + $num_shift{$ch};
		$n .= $';
		$data{$ch}{SSBridge}{$br}{R2} = $n;
	    }
	}
    }
    # remember first met chain ID
    $data{FirstChain} = $first_chain;

    return %data;
}

# converts SS states to HEC states
# INPUT: target string
# RETURN: new string containing only HEC characters
sub SStoHECstate {
  my $str = shift;
  $str = "\U$str";
  $str =~ s/[GI]/H/g;
  $str =~ s/B/E/g;
  $str =~ s/_/X/g;
  $str =~ s/[^HECX]/C/g;
  return $str;
}

