# MARC Workshop at PSC

# Essential Computing for Bioinformatics
#
# Examples by: Stuart Pomerance
# Pittsburgh Supercomputing Center

######################################################################
#
# Function and Class definitions
#
######################################################################

# key:value lookup tables (dictionary)

acid_to_protein = { 
    'TTT':'F', 'TTC':'F', 'TTA':'L', 'TTG':'L',
    'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S',
    'TAT':'Y', 'TAC':'Y', 'TAA':'*', 'TAG':'*',
    'TGT':'C', 'TGC':'C', 'TGA':'*', 'TGG':'W',

    'CTT':'L', 'CTC':'L', 'CTA':'L', 'CTG':'L',
    'CCT':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
    'CAT':'H', 'CAC':'H', 'CAA':'Q', 'CAG':'Q',
    'CGT':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R',

    'ATT':'I', 'ATC':'I', 'ATA':'I', 'ATG':'M',
    'ACT':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
    'AAT':'N', 'AAC':'N', 'AAA':'K', 'AAG':'K',
    'AGT':'S', 'AGC':'S', 'AGA':'R', 'AGG':'R',

    'GTT':'V', 'GTC':'V', 'GTA':'V', 'GTG':'V',
    'GCT':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
    'GAT':'D', 'GAC':'D', 'GAA':'E', 'GAG':'E',
    'GGT':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G'
    }

complement = {
    'A':'T', 'C':'G', 'R':'Y', 'X':'X',
    'T':'A', 'G':'C', 'Y':'R' 
    }

class FastaSequence:
    name     = ''
    comment  = ''
    sequence = ''

def read_fasta_file(filename):

    file = open(filename,"r") 

    # prepare a list for the lines of the sequence
    sequence_lines = [ ]
	
    # prepare the sequence list
    sequence = [ ]

    for line in file:

        # remove the trailing '\n' and trailing spaces
        line = line.rstrip('\n ')

        # if the line length is < 1, there nothing to do for this line
        # so move to the next line
        if len( line ) < 1:
            continue
                
        if line[0] == '>': # if the line starts with '>' a new sequence is beginning
                
            # a sequence may have just finished, and if so, then
            # append it to the list
            if len( sequence_lines ) > 0:
                fs.sequence = ''.join(sequence_lines)
                sequence.append(fs)
                
                # create a new container for the sequence
                fs = FastaSequence()
                
                # split the line on spaces so the identifier can be
                # separated from the description
                # a list of words is stored in a
                a = line.split(' ') 
                
                # the 0th element of the list, a, is the identifier
                # and take the slice of that string starting at 1, to remove
                # the leading '>'			
                fs.name = a[0][1:]
                    
                # join the rest of the words in the list, a, back together
                # with spaces and store the result in description
                fs.comment = ' '.join(a[1:])
                    
                # prepare a list for the lines of the sequence
                sequence_lines = [ ]
                    
            else:
                # do NOT use simple string concatenation here!!
                sequence_lines.append(line)
                
                file.close()
                
                # a sequence may have just finished, and if so, then
                # append it to the list
                if len( sequence_lines ) > 0:
                    fs.sequence = ''.join(sequence_lines)
                    sequence.append(fs)
        # end if
    # end for
    return( sequence )


def dna_to_protein(sequence):

    # calculate all protein frames at once
    # using lists for efficiency
    frame = [ [],[],[] ]		
    
    # make sure the sequence is all upper case
    # otherwise, acid_to_protein[] will fail
    seq = sequence.upper()
    
    # slicing past the end of the array doesn't throw an error!
    for i in range(0,len(seq),3):
        
        frame[0].append( acid_to_protein.get(seq[i+0:i+3],'?') )		
        frame[1].append( acid_to_protein.get(seq[i+1:i+4],'?') )
        frame[2].append( acid_to_protein.get(seq[i+2:i+5],'?') )
        
    # collapse the lists to 3 strings
    protein = []
    protein.append( ''.join(frame[0]) )
    protein.append( ''.join(frame[1]) )
    protein.append( ''.join(frame[2]) )
        
    return(protein)

def is_dna(sequence):
	seq = sequence.upper()
	if seq.find('U') == -1:
		return(1)
	return(0)


def reverse_complement(sequence):
    # make sure the sequence is all upper case
    # otherwise, complement[] will fail
    seq = sequence.upper()
    
    # compute with a list since its faster
    c = []
    
    # store length since its used in multiple places
    l = len(seq)
    
    for i in range(l):
        c.append( complement.get(seq[l-i-1],'?') )
        
    # collapse the list to the return string
    reverse_comp = ''.join(c)
    
    return(reverse_comp)

######################################################################
#
# begin execution of main program
#
######################################################################

print "reading file"

sequence = read_fasta_file("ecoli-U00096.fna")

print "frame 1,2,3 proteins for", sequence[0].name

p = dna_to_protein(sequence[0].sequence)

print "complement of", sequence[0].name

c = reverse_complement(sequence[0].sequence)

q = dna_to_protein(c)

#print p[0],'\n'
#print p[1],'\n'
#print p[2],'\n'
#print q[0],'\n'
#print q[1],'\n'
#print q[2],'\n'
#!/usr/bin/python

