import math
import cdi
## A317_rodriguez
'''
Construction of an optimal encoding and related questions
'''
# Characters and relative frequencies ('_' stands for ' ').
CF= [('a',575), ('b',128), ('c',263), ('d',285), ('e',913), ('f',173), 
    ('g',133), ('h',313), ('i',599), ('j',6),   ('k',84),  ('l',335), 
    ('m',235), ('n',596), ('o',689), ('p',192), ('q',8),   ('r',508), 
    ('s',567), ('t',706), ('u',334), ('v',69),  ('w',119), ('x',73), 
    ('y',164), ('z',7), ('_',1928)]

'''
1. Extract from CF the list F of relative frequencies and the list A of characters
'''
A,F = zip(*CF)
'''
2. Compute the list L of binary "Shannon lengths" corresponding to F.
   Use the formula E.3.2 on T3.10, which in terms of frequencies
   becomes ceil(log2(S)-log2(f_j)), S = sum(f_j).
   The function ceil is defined in the math module.
'''
S = sum(F)
L = [math.ceil(cdi.L2(S)-cdi.L2(fj)) for fj in F]
print ("Computed Shannon lengths L:")
print (L)
print ("------------")
'''
3. Consider the list H below. Check that H[j] <= F[j] for all j
   and yet that it satisfies kraft_sum(H) <= 1.

'''
H = [4,6,5,5,4,6,6,5,4,10,7,5,6,4,4,6,9,5,4,4,5,8,7,7,6,10,2]
# compute a list of all "error values" (if any)
check_Hj_le_Lj = [(x,y) for (x,y) in zip(H,L) if x>y]
if check_Hj_le_Lj:
  print ("The following pairs (H_j, L_j) don't hold the inequality H_j <= L_j")
  print (check_Hj_le_Lj)
else:
  print ("All pairs (H_j, L_j) satisfy H_j <= L_j")
print ("------------")
# compute and check kraft sum
if cdi.kraft(H, 2) > 1.0:
  print ("H does not satisfy the Kraft sum")
else:
  print ("H satisfies the Kraft sum")
print ("------------")

'''
4. Use the output C = ells2code(H) (see cdi) to construct a binary
   encoding E of A such that len(E(A[j]) = H[j]. Be very careful
   in how to relate the ordering of C to the orderings of A and H.
'''
# there is no guarantee of any kind of ordering in a dict
C = sorted(cdi.ells2code(H).items(), key = lambda x: x[0])
# sort pairs of (x, h_x) by h_x
AH = sorted(zip(A,H), key = lambda x : x[1])
# greedily assign shortest encodings to lower H values
E = dict([(x,y) for ((x,_),(_,y)) in zip(AH,C)])
print ("Encoding E: ")
print (E)
print ("------------")
AH = dict(AH)
# again, compute a list of "error values" (if any)
check_all_Ei_eq_Hi = [(x,y) for (x,y) in E.items() if len(y)!=AH[x]]
if check_all_Ei_eq_Hi:
  print ("The following pairs (Ai, Ci) don't satisfy len(Ci) = H[Ai]")
  print (check_all_Ei_eq_Hi)
else:
  print ("All characters have the same encoding length as indicated in H")

'''
5. Compute the average length l of the encoding E and compare
   it with the entropy e of the distribution F
'''
# convert frequencies into weights by mapping f(fi) = fi/total to F
F = [float(x)/float(S) for x in F]
print ("Entropy of distribution F:", cdi.entropy(F))
F = dict(zip(A,F))
# apply expect. formula E(X) = P(Xi)*i
print ("Average length of E:", sum([F[x]*len(y) for (x,y) in E.items()]))