## A317_x
'''
Construction of an optimal encoding and related questions
'''

# Characters and relative frequencies ('_' stands for ' ').

CF= [('a',575), ('b',128), ('c',263), ('d',285), ('e',913), ('f',173), 
    ('g',133), ('h',313), ('i',599), ('j',6),   ('k',84),  ('l',335), 
    ('m',235), ('n',596), ('o',689), ('p',192), ('q',8),   ('r',508), 
    ('s',567), ('t',706), ('u',334), ('v',69),  ('w',119), ('x',73), 
    ('y',164), ('z',7), ('_',1928)]  

'''
1. Extract the list F of relative frequencies and the list A of characters
'''
F = [f for _,f in CF] 
A = [x for x,_ in CF] 

'''
2. Compute the list L of binary "Shannon lengths" corresponding to F.
   Use the formula E.3.2 on T3.10, which in terms of frequencies
   becomes ceil(log2(S)-log2(f_j)), S = sum(f_j).
   The function ceil is defined in the math module.
'''
from cdi import *
from math import ceil
def shannon_frequencies(F):
    S = sum(F)
    L = L2(S)
    return[ceil(L-L2(f)) for f in F]
L = shannon_frequencies(F)

'''
3. Consider the list H below. Check that H[j] <= F[j] for all j
   and yet that it satisfies kraft_sum(H) <= 1.

'''
H = [4,6,5,5,4,6,6,5,4,10,7,5,6,4,4,6,9,5,4,4,5,8,7,7,6,10,2]
print(" H <= L?",all( h <= f for (h,f) in zip(H,L)))
print('kraft(H) =', kraft(H))
print()

'''
4. Use the output C = ells2code(H) (see cdi) to construct a binary
   encoding E of A such that len(E(A[j]) = H[j]. Be very careful
   in how to relate the ordering of C to the orderings of A and H.
'''
C = ells2code(H)
P = zip(A,H)
P = sorted(P,key =lambda x: x[1]) # reorder P with H non-decreasing
E ={p[0]:C[j] for (j,p) in zip(C,P)}

'''
5. Compute the average length l of the encoding E and compare
   it with the entropy e of the distribution F
'''
CF = dict(CF)
a = sum(CF[x]*len(E[x]) for x in A)/sum(F)
print("Average length: ",a)
print()

# a check
b = sum(f*l for (f,l) in zip(F,H))/sum(F)
print("A check: a = b? ", a == b)
print()

e = entropy(F)

print("a-e =",a-e)