From 5f26038de800f475601daf4827479347c591b501 Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Tue, 21 Jan 2020 12:33:24 +0100 Subject: [PATCH] finally got the aligner running --- bitext_align.py | 88 ++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/bitext_align.py b/bitext_align.py index b3de7e4..5068a02 100644 --- a/bitext_align.py +++ b/bitext_align.py @@ -4,12 +4,13 @@ import os,sys import re import pandas as pd +import numpy as np from numpy import cumsum from pandas import DataFrame from nltk import word_tokenize, sent_tokenize -import xml.etree.ElementTree as ET +#import xml.etree.ElementTree as ET from jellyfish import levenshtein_distance as lev -import six +#import six from google.cloud import translate_v2 as translate from itertools import product as cp @@ -24,12 +25,12 @@ translate_client = translate.Client() def master_align(text0, text1, lang0, lang1): """ Takes two equivalent texts (original and trnslation) and returns aligned texts. """ - df0 = frame_from_text(text0, source=lang0, target=lang1) + df0 = frame_from_text(text0, lang0, lang1) print('A') - df1 = frame_from_text(text1, source=lang1, target=lang0, is1=True) + df1 = frame_from_text(text1, lang1, lang0, is1=True) print('B') # returns dfs with ['sent', 'trans', 'rellen', 'relpos'] - anchors = anchors_from_frames(df0, df1, score_funct, score_threshold, window=2) + anchors = anchors_from_frames(df0, df1, window=2) print('C') alignments = intermediate_align(df0, df1, anchors, lookahead=4) print('D') @@ -38,9 +39,11 @@ def master_align(text0, text1, lang0, lang1): return textdict0, textdict1 -def frame_from_text(text, source='ru', target='en', is1=False): # +def frame_from_text(text, source, target, is1=False): # """ """ # + #print(source, '-->', target) cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']] + #print(cols) frame = pd.DataFrame(columns=cols) frame[cols[0]] = sent_tokenize(text) frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText']) @@ -49,35 +52,41 @@ def frame_from_text(text, source='ru', target='en', is1=False): # cumul_b = list(np.cumsum(frame[cols[2]])) cumul_a = [0]+cumul_b[:-1] frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b))) + #print(frame[[cols[0], cols[1]]]) return frame def anchors_from_frames(frame0, frame1, window): # """ """ pairdf = generate_pairdf(frame0, frame1, window) - pairdf['lev0'] = pairdf[['sent0', 'trans1']].apply(lambda x: trdist(x.sent0, x.trans1)) - pairdf['lev1'] = pairdf[['sent1', 'trans0']].apply(lambda x: trdist(x.sent1, x.trans0)) + frame0['index0'] = frame0.index + frame1['index1'] = frame1.index + pairdf = pairdf.merge(frame0, on='index0').merge(frame1, on='index1') + pairdf['lev0'] = pairdf.apply(lambda x: trdist(x.sent0, x.trans1), axis=1) + pairdf['lev1'] = pairdf.apply(lambda x: trdist(x.sent1, x.trans0), axis=1) pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1) pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1) pairdf['maxlev'] = pairdf[['lev0', 'lev1']].min(axis=1) - pairdf['isanchor'] = pairdf.minlev<0.45 & pairdf.maxlev<0.6 & pairdf.rellen_ratio<1.3 - return pairdf[pairdf.isanchor][['index0','index1']] + pairdf['isanchor'] = (pairdf.minlev<0.45) & (pairdf.maxlev<0.6) & (pairdf.rellen_ratio<1.3) + return list(pairdf[pairdf.isanchor][['index0','index1']].values) def intermediate_align(frame0, frame1, anchs, lookahead): # """ """ aligns = [] end0, end1 = frame0.shape[0], frame1.shape[0] - anchor_ranges = lis(zip([(-1,-1)]+anchs, anchs+[(end0, end1)])) + anchor_ranges = list(zip([(-1,-1)]+anchs, anchs+[(end0, end1)])) for rang in anchor_ranges: interaligns = get_interalign(frame0, frame1, *rang, lookahead) - aligns.append(rang[0]) + a,b = rang[0] + aligns.append(((a,b),(a,b))) aligns.extend(interaligns) return aligns[1:] # format [((i_start, i_end),(j_start, j_end))] def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): # """ """ + print(anchors_init, anchors_next) interaligns = [] i,j = anchors_init i+=1 @@ -85,19 +94,25 @@ def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): # end0, end1 = anchors_next while i0] + len0 = frame0.shape[0] + len1 = frame1.shape[0] allpairs = [] for i,j in overlap: for k in range(-window, window+1): for l in range(-window, window+1): - allpairs.append() + allpairs.append((i+k,j+l)) + allpairs = [(a,b) for a,b in allpairs if min(a,b)>-1 and aa1 and b0<=b1: - return b0-max(a0,a1) - elif a0>=a1 and a0=a0 and a1c and b<=d: + return b-max(a,c) + elif a>=c and a=a and c1 else x # -trdist = lambda x,y: lev(x,y)/max(x,y) # +trdist = lambda x,y: lev(x,y)/max(len(x),len(y)) #