#!/usr/bin/env python

# To run this script the following must be installed:
# ClustalW, available at ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/
# eGenix, available at http://www.egenix.com/files/python/eGenix-mx-Extensions.html
# Biopython, available at http://www.biopython.org/download



# standard libraries
import os
import re
import shutil
import string
import sys

# biopython
from Bio.Alphabet import IUPAC
from Bio import Clustalw
from Bio.Clustalw import MultipleAlignCL
from Bio.Align import AlignInfo
from Bio.SubsMat import FreqTable

# Stop right away when no PDB file was named on the command line
if not sys.argv[1:]:
    print('Usage: FAB.py <*.pdb>')
    print('Not enough arguments detected')
    sys.exit(1)

# These fasta files will be needed for the alignment (obviously the location
# needs to be changed for most users).
# The copies are made with shutil.copy rather than by spawning `cp` through a
# shell, so a missing or unreadable source raises here instead of the script
# carrying on without the reference files.
FASTA_SOURCE_DIR = '/home/crichton/scripts'
for reference in ('Light.fasta', 'Heavy.fasta'):
    shutil.copy(os.path.join(FASTA_SOURCE_DIR, reference), reference)


#defining name variables
# Every derived name is built from the part of the PDB path that precedes
# its first '.'
PDBfile = sys.argv[1]
stem = PDBfile.split('.')[0]

Lname = stem
Hname = stem

Lfasta = '%s.L.fasta' % Lname
Hfasta = '%s.H.fasta' % Hname

FABname = '%s.fab' % Hname

#List Chains
# Collect the chain identifier (column 22) of every backbone-nitrogen record,
# in order of first appearance.
pdb_handle = open(PDBfile, 'r')
pdb = pdb_handle.readlines()
pdb_handle.close()

chains = []
for item in pdb:
    # Lines of 50 characters or fewer are skipped
    if len(item) <= 50:
        continue
    # Each identifier is listed once only, even when the chain shows up again
    # further down the file; a repeated entry would make the per-chain
    # temporary files be written, and later removed, twice.
    if item[13:16] == 'N  ' and item[21] not in chains:
        chains.append(item[21])

#Extract Chains
# Each chain gets its own '<chain>.pdb' holding the lines longer than 50
# characters that carry its identifier, TER lines excluded
pdb = open(PDBfile, 'r').readlines()
for i in chains:
    Cname = '%s.pdb' % i
    output = open(Cname, 'w')
    output.writelines([item for item in pdb
                       if len(item) > 50
                       and item[21] == i
                       and item[0:3] != 'TER'])
    output.close()

# buffer clearing (added by the original author as a workaround; whether it is
# still required has not been re-tested). 'blah.pdb' receives the
# backbone-nitrogen records of the chain left in `i` by the extraction loop.
pdb = open(PDBfile, 'r').readlines()
Cname = 'blah.pdb'
output = open(Cname, 'w')
for item in pdb:
    if len(item) > 50 and item[13:16] == 'N  ' and item[21] == i:
        output.write(item)
output.close()

#Write Fastas for all chains
# Three-letter residue names and the one-letter codes written for them
THREE_TO_ONE = {
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
    'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
    'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
    'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
}

for i in chains:
    chain_handle = open(i + '.pdb', 'r')
    chain_lines = chain_handle.readlines()
    chain_handle.close()

    # One letter per backbone-nitrogen record; records whose residue name is
    # not in the table contribute nothing
    residues = [THREE_TO_ONE[item[17:20]] for item in chain_lines
                if item[13:16] == 'N  ' and item[17:20] in THREE_TO_ONE]

    output = open(i + '.fasta', 'w')
    try:
        output.write('>' + Hname + ' ' + i + '\n')
        # The sequence is written on one line with no trailing newline
        output.write(''.join(residues))
    finally:
        # Closed inside the loop so that every chain's file is closed
        # explicitly, not only the last one
        output.close()



#Determine light/heavy chains

# Identifiers of the light and heavy chain; empty until one is recognised
ChainL = ChainH = ''

print(chains)

# Sequence motifs searched for around the CDRs: the first letter is the chain
# (l = light, h = heavy), the digit is the CDR number and the last letter
# tells whether the motif locates the CDR's start (s) or end (e).
l1s, l1e = re.compile('[ILM][TAS]C'), re.compile('W[FYLV][QL]')
l2s, l2e = re.compile('[WLI][IVK][YGF]'), re.compile('[EG][VI]P')
l3s, l3e = re.compile('Y[YF]C'), re.compile('FG[SAGTQ]')

h1s, h1e = re.compile('C[KTAS][AVTD][TS]'), re.compile('W[FIV][KRA][EQTK]')
h2s, h2e = (re.compile('[WY][ILVM][GASN]'),
            re.compile('Y[RNSDTPALH][ESPADGQ][KASTRHD][FLV][KQM]'))
h3s, h3e = re.compile('C[LTVA][SHRKNG-]'), re.compile('W[G ][QA ]')

def _has_all_motifs(motifs, sequence):
    """Tell whether every compiled pattern in `motifs` matches in `sequence`."""
    for motif in motifs:
        if motif.search(sequence) is None:
            return False
    return True


# The first chain carrying all six light-chain motifs becomes the light chain.
# Any chain that is not taken as the light chain and carries all six
# heavy-chain motifs becomes the heavy chain; a later match replaces an
# earlier one.
light_motifs = (l1s, l1e, l2s, l2e, l3s, l3e)
heavy_motifs = (h1s, h1e, h2s, h2e, h3s, h3e)

for i in chains:
    fname = i + '.fasta'
    sequence = open(fname, 'r').read()
    if ChainL == '' and _has_all_motifs(light_motifs, sequence):
        ChainL = i
    elif i != ChainL and _has_all_motifs(heavy_motifs, sequence):
        ChainH = i

# When motif matching left a chain unassigned, fall back on the chain
# letters: first the L/H pair, then A as light and B as heavy.
has_L = 'L' in chains
has_H = 'H' in chains

if ChainL == 'L' and ChainH != 'H' and has_H:
    ChainH = 'H'
elif ChainH == 'H' and ChainL != 'L' and has_L:
    ChainL = 'L'
elif has_L and has_H and not (ChainH != '' and ChainL != ''):
    ChainL = 'L'
    ChainH = 'H'

if ChainL == 'A' and ChainH == '' and 'B' in chains:
    ChainH = 'B'
if ChainH == 'B' and ChainL == '' and 'A' in chains:
    ChainL = 'A'

# Both chains are required for everything that follows
if not (ChainH != '' and ChainL != ''):
    print('Target is not an antibody-antigen file, no need to make a fab file.')
    sys.exit()

print(ChainL + ' is the light chain')
print(ChainH + ' is the heavy chain')

#Rewrite PDB (necessary to ensure that the order of the pdb is (antigen), (light chain), (heavy chain))
# NOTE: the input PDB file is overwritten in place from the per-chain files.
other_chains = [chain for chain in chains
                if chain != ChainL and chain != ChainH]

output = open(PDBfile, 'w')
try:
    # Every chain that is neither light nor heavy comes first, ended by TER
    for chain in other_chains:
        chain_handle = open(chain + '.pdb', 'r')
        output.writelines(chain_handle.readlines())
        chain_handle.close()

    output.write('TER' + '\n')

    # Then the light chain followed directly by the heavy chain
    for chain in (ChainL, ChainH):
        chain_handle = open(chain + '.pdb', 'r')
        output.writelines(chain_handle.readlines())
        chain_handle.close()

    # The closing TER is written without a trailing newline
    output.write('TER')
finally:
    # close() must actually be called; a bare `output.close` does nothing and
    # leaves the rewritten file unflushed
    output.close()

Lfasta = ChainL + '.fasta'
Hfasta = ChainH + '.fasta'

# create the command line to run clustalw
# this assumes you've got clustalw somewhere on your path, otherwise
# you need to pass a second argument to MultipleAlignCL with the complete
# path to clustalw

#Create copies of the antibodies and add the chain-fastas to them
# Each entry: (reference set, chain fasta, combined ClustalW input, text
# written before the reference set).
# NOTE(review): only the light-chain input starts with a blank line; this is
# kept unchanged - confirm whether the difference is intentional.
for reference, chain_fasta, combined, lead in (
        ('Light.fasta', Lfasta, 'PreLight.aln', '\n'),
        ('Heavy.fasta', Hfasta, 'PreHeavy.aln', '')):
    handle = open(reference, 'r')
    Main = handle.read()
    handle.close()

    handle = open(chain_fasta, 'r')
    App = handle.read()
    handle.close()

    output = open(combined, 'w')
    try:
        output.write(lead)
        output.write(Main)
        output.write(App)
    finally:
        # Closed explicitly so the complete input is on disk before ClustalW
        # is started on it
        output.close()


# Names of the two ClustalW alignment files
Lalnname = Hname + '.l.aln'
Laln = Hname + '.l.aln'
Haln = Hname + '.h.aln'

#Create the command line for the light chain
light_input = os.path.join(os.curdir, 'PreLight.aln')
cline = MultipleAlignCL(light_input)
cline.set_output(Laln)
cline.gap_open_pen = 10
cline.gap_ext_pen = 0.1
cline.set_protein_matrix('GONNET')

# run ClustalW, keep the alignment object it returns and show the command
alignment = Clustalw.do_alignment(cline)
print(str(cline))

all_records = alignment.get_all_seqs()


#Read alignment file and write aligned sequence to a pseudo-fasta
text = open(Laln, 'r')
lines = text.readlines()
text.close()

NewLname = Hname + '.lnew.fasta'
output = open(NewLname, 'w')
try:
    output.write('>' + Hname + '\n')
    for line in lines:
        # Rows of the target are recognised by their first four characters;
        # everything from column 17 on is copied
        if line[0:4] == Hname:
            output.write(line[16:])
finally:
    # close() must actually be called; a bare `output.close` does nothing
    output.close()

#Create the command line for the heavy chain
heavy_input = os.path.join(os.curdir, 'PreHeavy.aln')
cline = MultipleAlignCL(heavy_input)
cline.set_output(Haln)
cline.gap_open_pen = 10
cline.gap_ext_pen = 0.1
cline.set_protein_matrix('GONNET')

# run ClustalW, keep the alignment object it returns and show the command
alignment = Clustalw.do_alignment(cline)
print(str(cline))

all_records = alignment.get_all_seqs()

#Read for location of CDR termini
# The rows of the light-chain alignment that start with '1oak' (presumably an
# entry of the reference set - confirm) are collected in 'Lifasta'.
text_handle = open(Laln, 'r')
text = text_handle.readlines()
text_handle.close()

output = open('Lifasta', 'w')
try:
    for line in text:
        if line[0:4] == '1oak':
            output.write(line[16:])
finally:
    # close() must actually be called; a bare `output.close` does nothing and
    # the file could still be incomplete when it is read back
    output.close()

#Read alignment file and write aligned sequence to a pseudo-fasta
text = open(Haln, 'r')
lines = text.readlines()
text.close()

NewHname = Hname + '.hnew.fasta'
output = open(NewHname, 'w')
try:
    output.write('>' + Hname + '\n')
    for line in lines:
        # Rows of the target are recognised by their first four characters
        # and must be longer than the name itself; everything from column 17
        # on is copied
        if line[0:4] == Hname and len(line) > len(Hname):
            output.write(line[16:])
finally:
    # close() must actually be called; a bare `output.close` does nothing
    output.close()

#Determine location of L-chain CDR's
lizzle = open('Lifasta', 'r')
L = lizzle.read()
lizzle.close()


def _light_motif_start(motif, label):
    """Return the index in `L` where `motif` first matches.

    Exits with a message naming `label` when the motif is absent, instead of
    failing on an attribute lookup on None.
    """
    found = motif.search(L)
    if found is None:
        sys.exit('FAB.py: light-chain motif ' + label +
                 " was not found in 'Lifasta'")
    return found.start()


# Each boundary is the motif position shifted by a fixed offset
L1_start = _light_motif_start(l1s, 'l1s') + 3
L1_end = _light_motif_start(l1e, 'l1e')
L2_start = _light_motif_start(l2s, 'l2s') + 3
L2_end = _light_motif_start(l2e, 'l2e') - 1
L3_start = _light_motif_start(l3s, 'l3s') + 2
L3_end = _light_motif_start(l3e, 'l3e') - 1

#Read for location of CDR termini
# The rows of the heavy-chain alignment that start with '1oak' (presumably an
# entry of the reference set - confirm) are collected in 'Hefasta'.
text_handle = open(Haln, 'r')
text = text_handle.readlines()
text_handle.close()

output = open('Hefasta', 'w')
try:
    for line in text:
        if line[0:4] == '1oak':
            output.write(line[16:])
finally:
    # close() is really called here, so 'Hefasta' is complete on disk before
    # it is read back. That removes the "buffer issue" for which 'Lifasta'
    # used to be rewritten at this point purely to force a flush; the
    # rewritten content was never read, so the rewrite is gone.
    output.close()


#Determine location of H-chain CDR's
hizzle = open('Hefasta', 'r')
H = hizzle.read()
hizzle.close()


def _heavy_motif_start(motif, label):
    """Return the index in `H` where `motif` first matches.

    Exits with a message naming `label` when the motif is absent, instead of
    failing on an attribute lookup on None.
    """
    found = motif.search(H)
    if found is None:
        sys.exit('FAB.py: heavy-chain motif ' + label +
                 " was not found in 'Hefasta'")
    return found.start()


# Each boundary is the motif position shifted by a fixed offset
H1_start = _heavy_motif_start(h1s, 'h1s') + 4
H1_end = _heavy_motif_start(h1e, 'h1e') - 3
H2_start = _heavy_motif_start(h2s, 'h2s') + 3
H2_end = _heavy_motif_start(h2e, 'h2e') - 1
H3_start = _heavy_motif_start(h3s, 'h3s')
H3_end = _heavy_motif_start(h3e, 'h3e') - 3


#Create the FAB File

# Rows of the two pseudo-fasta files (header row first)
llines = open(NewLname, 'r').readlines()
hlines = open(NewHname, 'r').readlines()

# The lengths count every character of the files, header and newlines
# included
L_length = sum([len(row) for row in llines])
H_length = sum([len(row) for row in hlines])

Seq_length = L_length + H_length

print('The alignment contains ' + str(Seq_length) + ' residues')

FABname = Hname + '.fab'
output = open(FABname, 'w')


#Cut off lines that are nothing but extraneous gaps at the end of the alignment

residue_re = re.compile('[ACDEFGHIKLMNPQRSTVWY]')

# Trailing rows without a single residue letter are dropped
while llines and residue_re.search(llines[-1]) is None:
    del llines[-1]

while hlines and residue_re.search(hlines[-1]) is None:
    del hlines[-1]

# Stop with a readable message rather than an IndexError when nothing is left
if not llines or not hlines:
    sys.exit('FAB.py: the aligned sequence rows contain no residues')

#Ensure that the evil 'extra Alanine' bug doesn't rear its ugly head
# NOTE(review): the guard that used to wrap the first two replacements
# compared the re.search() results with the string 'None', which never
# matches, so both replacements have always run. They are kept unconditional
# so the output does not change - confirm whether they were meant to run only
# when both last rows contain 'A-'.
llines[-1] = llines[-1].replace('A-', '--')
hlines[-1] = hlines[-1].replace('A-', '--')
llines[-1] = llines[-1].replace('AA', 'A-')

#begin the actual construction of the TFN

def _run(symbol, count):
    """Return `symbol` repeated `count` times ('' when `count` is not positive)."""
    return symbol * max(count, 0)


# Runs of '.' and 'T' are generated to the requested length instead of being
# sliced from fixed-width strings bound to the names False and True. That
# leaves the built-in constants alone, no longer caps a run at the width of
# the template, and gives an empty run for a negative length instead of a
# slice counted from the end of the template.
# '.' runs cover the stretches outside the CDRs (F*_length) and 'T' runs the
# CDRs themselves (T*_length); fixed 'N' runs separate them.

F1_length = L1_start - 5
T1_length = L1_end - L1_start
F2_length = L2_start - L1_end - 6
T2_length = L2_end - L2_start
F3_length = L3_start - L2_end - 6
T3_length = L3_end - L3_start
F4_length = L_length - L3_end - 9

F5_length = H1_start - 6
T4_length = H1_end - H1_start
F6_length = H2_start - H1_end - 7
T5_length = H2_end - H2_start
F7_length = H3_start - H2_end - 9
T6_length = H3_end - H3_start
F8_length = H_length - H3_end - 9


L_TFN = ('NN' + _run('.', F1_length)
         + 'NNN' + _run('T', T1_length)
         + 'NNN' + _run('.', F2_length)
         + 'NNN' + _run('T', T2_length)
         + 'NNN' + _run('.', F3_length)
         + 'NNN' + _run('T', T3_length)
         + 'NNNNNNNNN' + _run('.', F4_length))

H_TFN = ('NN' + _run('.', F5_length)
         + 'NNNN' + _run('T', T4_length)
         + 'NNNN' + _run('.', F6_length)
         + 'NNN' + _run('T', T5_length)
         + 'NNNNNN' + _run('.', F7_length)
         + 'NNN' + _run('T', T6_length)
         + 'NNNNNNNNN' + _run('.', F8_length))

align_length = len(L_TFN) + len(H_TFN)
print('The TFN is ' + str(align_length) + ' charecters')


def _write_rows(chain_label, rows, mask, full_rows, optional_rows):
    """Write sequence rows of one chain to the FAB file, each followed by its TFN row.

    chain_label   -- text placed between the name and the sequence
    rows          -- rows of the pseudo-fasta; row 0 is the header and is
                     never written
    mask          -- the TFN string of the chain
    full_rows     -- row numbers that are always written, each with a full
                     60-column slice of the mask
    optional_rows -- row numbers written only when present, with the mask cut
                     to the width of the row (its newline not counted)
    Rows beyond the numbers listed are not written.
    """
    for index in full_rows:
        start = (index - 1) * 60
        output.write(Hname + chain_label + rows[index])
        output.write('TFN             ' + mask[start:start + 60] + '\n')
    for index in optional_rows:
        if len(rows) >= index + 1:
            start = (index - 1) * 60
            end = len(rows[index]) + start - 1
            output.write(Hname + chain_label + rows[index])
            output.write('TFN             ' + mask[start:end] + '\n')


_write_rows(' L          ', llines, L_TFN, (1, 2), (3, 4))
_write_rows(' H          ', hlines, H_TFN, (1, 2, 3), (4, 5, 6))

# close() must actually be called; a bare `output.close` does nothing and
# leaves the FAB file to be flushed only when the interpreter exits
output.close()

#Remove extraneous files created by the script
# Collected in removal order: pseudo-fastas, ClustalW inputs, guide trees and
# alignments, per-chain files, then the copied reference sets and the
# reference rows
leftovers = [NewHname, NewLname,
             'PreLight.aln', 'PreHeavy.aln',
             'PreLight.dnd', 'PreHeavy.dnd',
             Laln, Haln]
leftovers += [i + '.pdb' for i in chains]
leftovers.append('blah.pdb')
leftovers += [i + '.fasta' for i in chains]
leftovers += ['Heavy.fasta', 'Light.fasta', 'Hefasta', 'Lifasta']

for leftover in leftovers:
    os.remove(leftover)
