Merge remote-tracking branch 'origin/master'

master
Pavan Mandava 6 years ago
commit 4217754b84

@ -4,12 +4,13 @@
import os,sys import os,sys
import re import re
import pandas as pd import pandas as pd
import numpy as np
from numpy import cumsum from numpy import cumsum
from pandas import DataFrame from pandas import DataFrame
from nltk import word_tokenize, sent_tokenize from nltk import word_tokenize, sent_tokenize
import xml.etree.ElementTree as ET #import xml.etree.ElementTree as ET
from jellyfish import levenshtein_distance as lev from jellyfish import levenshtein_distance as lev
import six #import six
from google.cloud import translate_v2 as translate from google.cloud import translate_v2 as translate
from itertools import product as cp from itertools import product as cp
@ -24,12 +25,12 @@ translate_client = translate.Client()
def master_align(text0, text1, lang0, lang1): def master_align(text0, text1, lang0, lang1):
""" Takes two equivalent texts (original and translation) and returns """ Takes two equivalent texts (original and translation) and returns
aligned texts. """ aligned texts. """
df0 = frame_from_text(text0, source=lang0, target=lang1) df0 = frame_from_text(text0, lang0, lang1)
print('A') print('A')
df1 = frame_from_text(text1, source=lang1, target=lang0, is1=True) df1 = frame_from_text(text1, lang1, lang0, is1=True)
print('B') print('B')
# returns dfs with ['sent', 'trans', 'rellen', 'relpos'] # returns dfs with ['sent', 'trans', 'rellen', 'relpos']
anchors = anchors_from_frames(df0, df1, score_funct, score_threshold, window=2) anchors = anchors_from_frames(df0, df1, window=2)
print('C') print('C')
alignments = intermediate_align(df0, df1, anchors, lookahead=4) alignments = intermediate_align(df0, df1, anchors, lookahead=4)
print('D') print('D')
@ -38,9 +39,11 @@ def master_align(text0, text1, lang0, lang1):
return textdict0, textdict1 return textdict0, textdict1
def frame_from_text(text, source='ru', target='en', is1=False): # def frame_from_text(text, source, target, is1=False): #
""" """ # """ """ #
#print(source, '-->', target)
cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']] cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']]
#print(cols)
frame = pd.DataFrame(columns=cols) frame = pd.DataFrame(columns=cols)
frame[cols[0]] = sent_tokenize(text) frame[cols[0]] = sent_tokenize(text)
frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText']) frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
@ -49,35 +52,41 @@ def frame_from_text(text, source='ru', target='en', is1=False): #
cumul_b = list(np.cumsum(frame[cols[2]])) cumul_b = list(np.cumsum(frame[cols[2]]))
cumul_a = [0]+cumul_b[:-1] cumul_a = [0]+cumul_b[:-1]
frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b))) frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b)))
#print(frame[[cols[0], cols[1]]])
return frame return frame
def anchors_from_frames(frame0, frame1, window): # def anchors_from_frames(frame0, frame1, window): #
""" """ """ """
pairdf = generate_pairdf(frame0, frame1, window) pairdf = generate_pairdf(frame0, frame1, window)
pairdf['lev0'] = pairdf[['sent0', 'trans1']].apply(lambda x: trdist(x.sent0, x.trans1)) frame0['index0'] = frame0.index
pairdf['lev1'] = pairdf[['sent1', 'trans0']].apply(lambda x: trdist(x.sent1, x.trans0)) frame1['index1'] = frame1.index
pairdf = pairdf.merge(frame0, on='index0').merge(frame1, on='index1')
pairdf['lev0'] = pairdf.apply(lambda x: trdist(x.sent0, x.trans1), axis=1)
pairdf['lev1'] = pairdf.apply(lambda x: trdist(x.sent1, x.trans0), axis=1)
pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1) pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1)
pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1) pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
pairdf['maxlev'] = pairdf[['lev0', 'lev1']].min(axis=1) pairdf['maxlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
pairdf['isanchor'] = pairdf.minlev<0.45 & pairdf.maxlev<0.6 & pairdf.rellen_ratio<1.3 pairdf['isanchor'] = (pairdf.minlev<0.45) & (pairdf.maxlev<0.6) & (pairdf.rellen_ratio<1.3)
return pairdf[pairdf.isanchor][['index0','index1']] return list(pairdf[pairdf.isanchor][['index0','index1']].values)
def intermediate_align(frame0, frame1, anchs, lookahead): # def intermediate_align(frame0, frame1, anchs, lookahead): #
""" """ """ """
aligns = [] aligns = []
end0, end1 = frame0.shape[0], frame1.shape[0] end0, end1 = frame0.shape[0], frame1.shape[0]
anchor_ranges = lis(zip([(-1,-1)]+anchs, anchs+[(end0, end1)])) anchor_ranges = list(zip([(-1,-1)]+anchs, anchs+[(end0, end1)]))
for rang in anchor_ranges: for rang in anchor_ranges:
interaligns = get_interalign(frame0, frame1, *rang, lookahead) interaligns = get_interalign(frame0, frame1, *rang, lookahead)
aligns.append(rang[0]) a,b = rang[0]
aligns.append(((a,b),(a,b)))
aligns.extend(interaligns) aligns.extend(interaligns)
return aligns[1:] # format [((i_start, i_end),(j_start, j_end))] return aligns[1:] # format [((i_start, i_end),(j_start, j_end))]
def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): # def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): #
""" """ """ """
print(anchors_init, anchors_next)
interaligns = [] interaligns = []
i,j = anchors_init i,j = anchors_init
i+=1 i+=1
@ -85,19 +94,25 @@ def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): #
end0, end1 = anchors_next end0, end1 = anchors_next
while i<end0 and j<end1: while i<end0 and j<end1:
room0, room1 = min(end0-i,lookahead), min(end1-j,lookahead) room0, room1 = min(end0-i,lookahead), min(end1-j,lookahead)
best_alignment = min([(x,y) for x,y in cp(range(i,i+room0),range(j+room1)) if x==i or y==j], key=score(df0, df1, start0, start1, end0, end1)) lambdascore = lambda p,q: score(df0, df1, i, j, p, q)
interaligns.append((best_alignment)) i_,j_ = min([(x,y) for x,y in cp(range(i,i+room0),range(j,j+room1)) if x==i or y==j], key=lambda a: lambdascore(*a))
print((i,j), (i_,j_))
interaligns.append(((i,j),(i_,j_)))
i,j = i_+1,j_+1
return interaligns return interaligns
def score(frame0, frame1, start0, start1, end0, end1): # def score(frame0, frame1, start0, start1, end0, end1): #
s0 = ' '.join(frame0.loc[start0:end0+1, 'sent0']) #print(frame0.columns)
s1 = ' '.join(frame0.loc[start1:end1+1, 'sent1']) #print(frame1.columns)
t0 = ' '.join(frame0.loc[start0:end0+1, 'trans0']) s0 = ' '.join(frame0.loc[start0:end0, 'sent0'])
t1 = ' '.join(frame0.loc[start1:end1+1, 'trans1']) s1 = ' '.join(frame1.loc[start1:end1, 'sent1'])
l0 = sum(frame0.loc[start0:end0+1, 'rellen0']) t0 = ' '.join(frame0.loc[start0:end0, 'trans0'])
l1 = sum(frame1.loc[start1:end1+1, 'rellen1']) t1 = ' '.join(frame1.loc[start1:end1, 'trans1'])
return (lev(s0,t1)+lev(s1,t0))*gr1(l0/l1)/2 l0 = sum(frame0.loc[start0:end0, 'rellen0'])
l1 = sum(frame1.loc[start1:end1, 'rellen1'])
#print(s0, s1, t0, t1, l0, l1)
return (trdist(s0,t1)+trdist(s1,t0))*gr1(l0/l1)/2
@ -105,9 +120,13 @@ def score(frame0, frame1, start0, start1, end0, end1): #
def textdicts_from_alignments(frame0, frame1, aligns): # def textdicts_from_alignments(frame0, frame1, aligns): #
""" """ """ """
textdict0, textdict1 = {},{} textdict0, textdict1 = {},{}
for i,((a0,b0),(a1,b1)) in enumerate(aligns): for i,((a0,a1),(b0,b1)) in enumerate(aligns):
t0 = ' '.join(frame0.loc[a0:b0+1, 'sent0']) t0 = ' '.join(frame0.loc[a0:b0, 'sent0'])
t1 = ' '.join(frame0.loc[a1:b1+1, 'sent0']) t1 = ' '.join(frame1.loc[a1:b1, 'sent1'])
print('***************************')
print(aligns[i])
print(t0)
print(t1)
textdict0.update({i:t0}) textdict0.update({i:t0})
textdict1.update({i:t1}) textdict1.update({i:t1})
return textdict0, textdict1 return textdict0, textdict1
@ -119,30 +138,33 @@ def generate_pairdf(frame0, frame1, window):
ranges0 = frame0.relpos0 ranges0 = frame0.relpos0
ranges1 = frame1.relpos1 ranges1 = frame1.relpos1
overlap = [(i,j) for (i,(a,b)),(j,(c,d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a,b,c,d)>0] overlap = [(i,j) for (i,(a,b)),(j,(c,d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a,b,c,d)>0]
len0 = frame0.shape[0]
len1 = frame1.shape[0]
allpairs = [] allpairs = []
for i,j in overlap: for i,j in overlap:
for k in range(-window, window+1): for k in range(-window, window+1):
for l in range(-window, window+1): for l in range(-window, window+1):
allpairs.append() allpairs.append((i+k,j+l))
allpairs = [(a,b) for a,b in allpairs if min(a,b)>-1 and a<len0 and b<len1]
allpairs = sorted(list(set(allpairs))) allpairs = sorted(list(set(allpairs)))
pairdf[pairdf.columns] = pd.DataFrame(allpairs).values pairdf = pd.DataFrame(allpairs).rename(columns={0:'index0', 1:'index1'})
return pairdf return pairdf
def get_overlap(a,b,c,d): def get_overlap(a,b,c,d):
#print(a0,b0,a1,b1) #print(a0,b0,a1,b1)
if b0>a1 and b0<=b1: if b>c and b<=d:
return b0-max(a0,a1) return b-max(a,c)
elif a0>=a1 and a0<b1: elif a>=c and a<d:
return min(b0,b1)-a0 return min(b,d)-a
elif a1>=a0 and a1<b0: elif c>=a and c<b:
return b1-max(a0,a1) return d-max(a,c)
else: else:
return 0 return 0
gr1 = lambda x: 1/less1(x) # gr1 = lambda x: 1/less1(x) #
less1 = lambda x: 1/x if abs(x)>1 else x # less1 = lambda x: 1/x if abs(x)>1 else x #
trdist = lambda x,y: lev(x,y)/max(x,y) # trdist = lambda x,y: lev(x,y)/max(len(x),len(y)) #

Loading…
Cancel
Save