minor changes

master
Isaac Riley 6 years ago
parent 5ae77714d5
commit c02fdd5e91

@ -25,18 +25,23 @@ def master_align(text0, text1, lang0, lang1):
""" Takes two equivalent texts (original and translation) and returns
aligned texts. """
df0 = frame_from_text(text0, source=lang0, target=lang1)
print('A')
df1 = frame_from_text(text1, source=lang1, target=lang0, is1=True)
print('B')
# returns dfs with ['sent', 'trans', 'rellen', 'relpos']
anchors = anchors_from_frames(df0, df1, score_funct, score_threshold, window=2)
print('C')
alignments = intermediate_align(df0, df1, anchors, lookahead=4)
print('D')
textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments)
print('E')
return textdict0, textdict1
def frame_from_text(text, source='ru', target='en', is0=True): #
def frame_from_text(text, source='ru', target='en', is1=False): #
""" """ #
cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']]
frame = pd.DataFrame(colmns=cols)
frame = pd.DataFrame(columns=cols)
frame[cols[0]] = sent_tokenize(text)
frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x))
@ -55,7 +60,7 @@ def anchors_from_frames(frame0, frame1, window): #
pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1)
pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
pairdf['maxlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
pairdf['isanchor'] = pairdf.minlev<045 & pairdf.maxlev<0.6 & pairdf.rellen_ratio<1.3
pairdf['isanchor'] = pairdf.minlev<0.45 & pairdf.maxlev<0.6 & pairdf.rellen_ratio<1.3
return pairdf[pairdf.isanchor][['index0','index1']]
@ -113,11 +118,11 @@ def generate_pairdf(frame0, frame1, window):
pairdf = pd.DataFrame(columns=['index0', 'index1'])
ranges0 = frame0.relpos0
ranges1 = frame1.relpos1
overlap = [i,j for (i,(a,b)),(j,(c,d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a,b,c,d)>0]
overlap = [(i,j) for (i,(a,b)),(j,(c,d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a,b,c,d)>0]
allpairs = []
for i,j in overlap:
for k in range(-window:window+1):
for l in range(-window:window+1):
for k in range(-window, window+1):
for l in range(-window, window+1):
allpairs.append()
allpairs = sorted(list(set(allpairs)))
pairdf[pairdf.columns] = pd.DataFrame(allpairs).values

Loading…
Cancel
Save