diff --git a/bitext_align.py b/bitext_align.py index 7641358..b3de7e4 100644 --- a/bitext_align.py +++ b/bitext_align.py @@ -22,21 +22,26 @@ translate_client = translate.Client() def master_align(text0, text1, lang0, lang1): - """ Takes two equivalent texts (original and trnslation) and returns - aligned texts. """ - df0 = frame_from_text(text0, source=lang0, target=lang1) - df1 = frame_from_text(text1, source=lang1, target=lang0, is1=True) + """ Takes two equivalent texts (original and trnslation) and returns + aligned texts. """ + df0 = frame_from_text(text0, source=lang0, target=lang1) + print('A') + df1 = frame_from_text(text1, source=lang1, target=lang0, is1=True) + print('B') # returns dfs with ['sent', 'trans', 'rellen', 'relpos'] anchors = anchors_from_frames(df0, df1, score_funct, score_threshold, window=2) + print('C') alignments = intermediate_align(df0, df1, anchors, lookahead=4) + print('D') textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments) + print('E') return textdict0, textdict1 -def frame_from_text(text, source='ru', target='en', is0=True): # +def frame_from_text(text, source='ru', target='en', is1=False): # """ """ # cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']] - frame = pd.DataFrame(colmns=cols) + frame = pd.DataFrame(columns=cols) frame[cols[0]] = sent_tokenize(text) frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText']) frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x)) @@ -44,7 +49,7 @@ def frame_from_text(text, source='ru', target='en', is0=True): # cumul_b = list(np.cumsum(frame[cols[2]])) cumul_a = [0]+cumul_b[:-1] frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b))) - return frame + return frame def anchors_from_frames(frame0, frame1, window): # @@ -55,8 +60,8 @@ def anchors_from_frames(frame0, frame1, window): # pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1) pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1) pairdf['maxlev'] = pairdf[['lev0', 'lev1']].min(axis=1) - pairdf['isanchor'] = pairdf.minlev<045 & pairdf.maxlev<0.6 & pairdf.rellen_ratio<1.3 - return pairdf[pairdf.isanchor][['index0','index1']] + pairdf['isanchor'] = pairdf.minlev<0.45 & pairdf.maxlev<0.6 & pairdf.rellen_ratio<1.3 + return pairdf[pairdf.isanchor][['index0','index1']] def intermediate_align(frame0, frame1, anchs, lookahead): # @@ -68,7 +73,7 @@ def intermediate_align(frame0, frame1, anchs, lookahead): # interaligns = get_interalign(frame0, frame1, *rang, lookahead) aligns.append(rang[0]) aligns.extend(interaligns) - return aligns[1:] # format [((i_start, i_end),(j_start, j_end))] + return aligns[1:] # format [((i_start, i_end),(j_start, j_end))] def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): # @@ -79,20 +84,20 @@ def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): # j+=1 end0, end1 = anchors_next while i0] + overlap = [(i,j) for (i,(a,b)),(j,(c,d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a,b,c,d)>0] allpairs = [] for i,j in overlap: - for k in range(-window:window+1): - for l in range(-window:window+1): + for k in range(-window, window+1): + for l in range(-window, window+1): allpairs.append() allpairs = sorted(list(set(allpairs))) pairdf[pairdf.columns] = pd.DataFrame(allpairs).values - return pairdf + return pairdf def get_overlap(a,b,c,d): - #print(a0,b0,a1,b1) + #print(a0,b0,a1,b1) if b0>a1 and b0<=b1: return b0-max(a0,a1) elif a0>=a1 and a0