## A317
## David Martín Alaminos

from math import ceil, log2
from cdi import kraft, ells2code, entropy


'''
Construction of an optimal encoding and related questions
'''

#Characters and relative frequencies ('_' stands for ' ').

CF= [('a',575), ('b',128), ('c',263), ('d',285), ('e',913), ('f',173),
    ('g',133), ('h',313), ('i',599), ('j',6),   ('k',84),  ('l',335),
    ('m',235), ('n',596), ('o',689), ('p',192), ('q',8),   ('r',508),
    ('s',567), ('t',706), ('u',334), ('v',69),  ('w',119), ('x',73),
    ('y',164), ('z',7), ('_',1928)]


'''
1.
Extract from CF the list F of relative frequencies and the list A of characters
'''

F = [f for (_,f) in CF]
A = [c for (c,_) in CF]



'''
2.
Compute the list L of binary "Shannon lengths" corresponding to F.
Use the formula E.3.2 on T3.10, which in terms of frequencies
becomes ceil(log2(S)-log2(f_j)), S = sum(f_j).
The function ceil is defined in the math module.
'''

S = sum(F)
L = [ceil(log2(S)-log2(f)) for f in F]
print("Shannon lengths:")
print(L)



'''
3.
Consider the list H below. Check that H[j] <= L[j] for
all j and yet that it satisfies kraft_sum(H) <= 1.
'''

H = [4,6,5,5,4,6,6,5,4,10,7,5,6,4,4,6,9,5,4,4,5,8,7,7,6,10,2]
assert(all([h <= l for (h,l) in zip(H,L)])) #if any H[j] > L[j], AssertionError is raised
assert(kraft(L) == True) #same for Kraft's inequality



'''
4.
Use the output C = ells2code(H) (see cdi) to construct a binary
encoding E of A such that len(E(A[j]) = H[j]. Be very careful
in how to relate the ordering of C to the orderings of A and H.
'''

C = ells2code(H)
C_sorted = [c for (c,_) in sorted(CF, key=lambda t: t[1], reverse=True)] #chars sorted by desc. frequency
X = list(C.values()) #binary codes already sorted by asc. length

E = list(zip(C_sorted,X)) #shortest codes are assigned to most frequent chars, and so on

c2x = dict(E)
assert(all([len(c2x[c]) == h for (c,h) in zip(A,H)]))

print()
print("Assembled encoding:")
print(E)



'''
5.
Compute the average length l of the encoding E and compare
it with the entropy e of the distribution F
'''

def avg_encoded_word_length(enc, freqs):
    code_length = {c:len(x) for (c,x) in enc}
    weighted_sum = 0
    freq_amount = 0

    for (c,f) in freqs:
        weighted_sum += f*code_length[c]
        freq_amount += f

    return weighted_sum/freq_amount


avg = avg_encoded_word_length(E,CF)
entr = entropy(F)
assert(avg >= entr)

print()
print("Average length of encoded words:", round(avg,4))
print("Entropy of the frequency distribution:", round(entr,4))