# -*- coding: utf-8 -*-
# Provenance: recovered from a git patch.
#   Commit:  d97a4f7d57849139a7f01237c6e5b952213da2db
#   Author:  Isaac Riley
#   Date:    Sat, 18 Jan 2020 19:07:03 +0100
#   Subject: [PATCH] alignment code so far (needs commenting and testing)
#   File:    bitext_align.py (new file)
"""Sentence alignment of a text with its translation (work in progress).

Each text is split into sentences and machine-translated into the other
language.  Sentence pairs whose translations are close in edit distance and
similar in relative length are taken as "anchors"; the sentences between
consecutive anchors are meant to be aligned by ``get_interalign``.

NOTE(review): the text this module was recovered from is damaged.  The loop
of ``get_interalign`` and the head of ``generate_pairdf`` are missing, and
``textdicts_from_alignments`` (called by ``master_align``) is not defined in
the recovered text.  Reconstructed parts are marked below.
"""

import os
import re
import sys
import xml.etree.ElementTree as ET
from itertools import product as cp

import pandas as pd
import six
from google.cloud import translate_v2 as translate
from jellyfish import levenshtein_distance as lev
from nltk import sent_tokenize, word_tokenize
from numpy import cumsum
from pandas import DataFrame

translate_client = translate.Client()


def master_align(text0, text1, lang0, lang1):
    """Take two equivalent texts (original and translation) and return
    aligned texts.

    Args:
        text0: First text.
        text1: Second text (the counterpart of ``text0``).
        lang0: Language code of ``text0``.
        lang1: Language code of ``text1``.

    Returns:
        The pair ``(textdict0, textdict1)`` produced by
        ``textdicts_from_alignments``.
    """
    # Frames have columns ['sent', 'trans', 'rellen', 'relpos'], suffixed
    # with '0' for the first text and '1' for the second.
    df0 = frame_from_text(text0, source=lang0, target=lang1)
    df1 = frame_from_text(text1, source=lang1, target=lang0, is1=True)
    # The original call also passed `score_funct` and `score_threshold`,
    # which are undefined and not accepted by anchors_from_frames.
    anchors = anchors_from_frames(df0, df1, window=2)
    alignments = intermediate_align(df0, df1, anchors, lookahead=4)
    # NOTE(review): textdicts_from_alignments is not defined in the recovered
    # source; this call raises NameError until it is supplied.
    textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments)
    return textdict0, textdict1


def frame_from_text(text, source='ru', target='en', is0=True, is1=None):
    """Build the per-sentence frame for one text.

    Args:
        text: Text to split into sentences.
        source: Language code of ``text``.
        target: Language code to translate each sentence into.
        is0: True when this is the first text (column suffix '0').
        is1: True when this is the second text (column suffix '1').  When
            given it takes precedence over ``is0``; this is the flag
            ``master_align`` passes.

    Returns:
        DataFrame with one row per sentence and the columns (suffix N is
        '0' or '1'):
        ``sentN``   - the sentence,
        ``transN``  - its machine translation into ``target``,
        ``rellenN`` - its length divided by the longest sentence's length,
        ``relposN`` - ``(start, end)`` cumulative sums of ``rellenN``.
    """
    # The original signature declared `is0` while the body read `is1`.
    if is1 is None:
        is1 = not is0
    suffix = str(int(bool(is1)))
    sent_col, trans_col, rellen_col, relpos_col = [
        name + suffix for name in ('sent', 'trans', 'rellen', 'relpos')]

    def _translate(sentence):
        result = translate_client.translate(
            sentence, source_language=source, target_language=target,
            model='nmt')
        return result['translatedText']

    frame = pd.DataFrame({sent_col: sent_tokenize(text)})
    frame[trans_col] = frame[sent_col].apply(_translate)
    lengths = frame[sent_col].apply(len)
    frame[rellen_col] = lengths / lengths.max()
    # The original called np.cumsum, but only `cumsum` is imported.
    ends = list(cumsum(frame[rellen_col]))
    starts = [0] + ends[:-1]
    frame[relpos_col] = pd.Series(
        list(zip(starts, ends)), index=frame.index, dtype=object)
    return frame[[sent_col, trans_col, rellen_col, relpos_col]]


def anchors_from_frames(frame0, frame1, window):
    """Select the candidate sentence pairs that qualify as anchors.

    Args:
        frame0: Frame of the first text (see ``frame_from_text``).
        frame1: Frame of the second text.
        window: Passed through to ``generate_pairdf``.

    Returns:
        DataFrame with columns ``index0`` and ``index1``, one row per anchor.
    """
    pairdf = generate_pairdf(frame0, frame1, window)
    # Compare each sentence with the translation of its counterpart.  The
    # original used DataFrame.apply without axis=1, which iterates columns.
    pairdf['lev0'] = [
        trdist(s, t) for s, t in zip(pairdf['sent0'], pairdf['trans1'])]
    pairdf['lev1'] = [
        trdist(s, t) for s, t in zip(pairdf['sent1'], pairdf['trans0'])]
    pairdf['rellen_ratio'] = (pairdf.rellen0 / pairdf.rellen1).apply(gr1)
    pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
    # The original computed maxlev with .min(), duplicating minlev.
    pairdf['maxlev'] = pairdf[['lev0', 'lev1']].max(axis=1)
    # NOTE(review): the source reads `minlev<045`, which is not a valid
    # literal; 0.45 is assumed.  Each comparison is parenthesised because
    # `&` binds tighter than `<`.
    pairdf['isanchor'] = (
        (pairdf.minlev < 0.45)
        & (pairdf.maxlev < 0.6)
        & (pairdf.rellen_ratio < 1.3))
    return pairdf[pairdf.isanchor][['index0', 'index1']]


def intermediate_align(frame0, frame1, anchs, lookahead):
    """Interleave the anchors with the alignments found between them.

    Args:
        frame0: Frame of the first text.
        frame1: Frame of the second text.
        anchs: Anchors, either the DataFrame returned by
            ``anchors_from_frames`` or an iterable of ``(i, j)`` pairs.
        lookahead: Passed through to ``get_interalign``.

    Returns:
        List holding, for each anchor in order, the anchor ``(i, j)``
        followed by whatever ``get_interalign`` returns for the gap up to
        the next anchor (or up to the end of both texts).  The alignments
        found before the first anchor come first.
    """
    if isinstance(anchs, pd.DataFrame):
        # The original did `[(-1, -1)] + anchs` on the DataFrame, which is
        # element-wise arithmetic rather than list concatenation.
        anchor_list = [
            (int(i), int(j))
            for i, j in anchs[['index0', 'index1']].itertuples(
                index=False, name=None)]
    else:
        anchor_list = [tuple(pair) for pair in anchs]

    end0, end1 = frame0.shape[0], frame1.shape[0]
    # Sentinels bracket the anchors so the leading and trailing stretches
    # are handled like any other gap.
    bounds = [(-1, -1)] + anchor_list + [(end0, end1)]
    aligns = []
    for start, stop in zip(bounds, bounds[1:]):
        interaligns = get_interalign(frame0, frame1, start, stop, lookahead)
        aligns.append(start)
        aligns.extend(interaligns)
    # Drop the leading (-1, -1) sentinel.
    # Original note: format [((i_start, i_end),(j_start, j_end))]
    return aligns[1:]


def get_interalign(df0, df1, anchors_init, anchors_next, lookahead):
    """Align the sentences lying strictly between two consecutive anchors.

    Args:
        df0: Frame of the first text.
        df1: Frame of the second text.
        anchors_init: ``(i, j)`` of the anchor that opens the gap.
        anchors_next: ``(i, j)`` of the anchor that closes the gap.
        lookahead: Search depth (its use is in the missing loop).

    Returns:
        List of alignments for the gap; empty when the gap is empty.

    Raises:
        NotImplementedError: when the gap contains sentences, because the
            alignment loop is missing from the recovered source.
    """
    interaligns = []
    i, j = anchors_init
    i += 1
    j += 1
    end0, end1 = anchors_next
    # NOTE(review): the source breaks off at `while i<...`; everything after
    # that is lost.  Only the case where no sentence lies between the two
    # anchors is answered here -- presumably the loop would not have run.
    if i >= end0 and j >= end1:
        return interaligns
    raise NotImplementedError(
        'get_interalign: the alignment loop is missing from the recovered '
        'source')


def generate_pairdf(frame0, frame1, window):
    """Build the frame of candidate sentence pairs.

    NOTE(review): the `def` line and first statements of this function are
    missing from the recovered source.  What survives is the end of a list
    comprehension (`... >0]`), the window expansion, and the final
    assignment into `pairdf`.  The head below is a reconstruction based on
    that tail and on the columns ``anchors_from_frames`` reads -- verify
    against the original.

    Args:
        frame0: Frame of the first text (columns ``sent0`` ... ``relpos0``).
        frame1: Frame of the second text (columns ``sent1`` ... ``relpos1``).
        window: Each overlapping pair ``(i, j)`` also contributes every
            ``(i + k, j + l)`` with ``-window <= k, l <= window``.

    Returns:
        DataFrame with ``index0``, ``index1`` and, for each pair, all columns
        of the corresponding rows of ``frame0`` and ``frame1``.
    """
    n0, n1 = frame0.shape[0], frame1.shape[0]
    spans0 = list(frame0['relpos0'])
    spans1 = list(frame1['relpos1'])
    overlap = [
        (i, j) for i, j in cp(range(n0), range(n1))
        if get_overlap(spans0[i][0], spans0[i][1],
                       spans1[j][0], spans1[j][1]) > 0]

    # The source had `range(-window:window+1)` (invalid syntax) and an
    # argument-less `allpairs.append()`.
    offsets = list(range(-window, window + 1))
    allpairs = set()
    for i, j in overlap:
        for di, dj in cp(offsets, offsets):
            # Skip neighbours that fall outside either text.
            if 0 <= i + di < n0 and 0 <= j + dj < n1:
                allpairs.add((i + di, j + dj))
    allpairs = sorted(allpairs)

    pairdf = pd.DataFrame(allpairs, columns=['index0', 'index1']).astype(int)
    rows0 = frame0.iloc[list(pairdf['index0'])].reset_index(drop=True)
    rows1 = frame1.iloc[list(pairdf['index1'])].reset_index(drop=True)
    return pd.concat([pairdf, rows0, rows1], axis=1)


def get_overlap(a, b, c, d):
    """Return the length of the overlap of intervals (a, b) and (c, d).

    The result is 0 when the intervals do not overlap.

    NOTE(review): the source named the parameters a, b, c, d but read
    a0, b0, a1, b1, and its later branches are lost.  The closed form below
    agrees with the one surviving branch (`b0 - max(a0, a1)` when
    `a1 < b0 <= b1`).
    """
    return max(0, min(b, d) - max(a, c))


def gr1(x):
    """Fold a positive ratio so that it is at least 1 (x -> 1/x if x < 1).

    NOTE(review): only `...1 else x` of the original lambda survives; this
    form is inferred from the name and from the `rellen_ratio < 1.3` test.
    """
    if x <= 0:
        # No finite reciprocal; treat as an unbounded ratio.
        return float('inf')
    return 1 / x if x < 1 else x


def trdist(x, y):
    """Return the Levenshtein distance of x and y divided by the longer length.

    The original divided by ``max(x, y)``, i.e. by a string.
    """
    longest = max(len(x), len(y))
    if longest == 0:
        return 0.0
    return lev(x, y) / float(longest)