Commit a8710829 by xushangqing

xsq modified

parent e26b16c7
venv
.env
data/train_dataset/cpp-cuda-*
*.err
*.out
*.log
*.tmp
*log
*.patch
*.tok
*.bpe
trained/*
\ No newline at end of file
{
"python.pythonPath": "/lustre/S/xushangqing/anaconda3/bin/python"
}
\ No newline at end of file
......@@ -3,6 +3,12 @@
Pytorch original implementation of TransCoder in [Unsupervised Translation of Programming Languages](https://arxiv.org/pdf/2006.03511.pdf)
![Model](https://dl.fbaipublicfiles.com/transcoder/TransCoder_Schema.jpg)
## XSQ Comment
The data preprocessing script that works on the original BigQuery data is in ./selfmade/filter.py
To avoid network failures during training, multi-bleu.perl (modified in the BLEU calculation part) has been moved into XLM/src/evaluation/
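As a minimal sketch (not the evaluator's exact code), BLEU can then be computed against the local copy of the script; the script path and helper name below are assumptions for illustration:

```python
import subprocess

# Assumed location of the relocated script (adjust to your checkout).
BLEU_SCRIPT = "XLM/src/evaluation/multi-bleu.perl"

def local_bleu(ref_path, hyp_path):
    # Invoke the local Perl script instead of fetching mosesdecoder over the network.
    out = subprocess.run(f"perl {BLEU_SCRIPT} {ref_path} < {hyp_path}",
                         shell=True, capture_output=True, text=True).stdout
    # Expected output: 'BLEU = 12.34, 40.1/20.2/10.3/5.4 (BP=..., ratio=..., ...)'
    return float(out[7:out.index(",")]) if out.startswith("BLEU") else 0.0
```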
## Dependencies
- Python 3
- [NumPy](http://www.numpy.org/)
......
......@@ -226,6 +226,8 @@ class Dataset(object):
if tokens_per_batch == -1:
batches = np.array_split(indices, math.ceil(
len(indices) * 1. / self.batch_size))
# print(batches)
# exit()
else:
batch_ids = np.cumsum(lengths[indices]) // tokens_per_batch
_, bounds = np.unique(batch_ids, return_index=True)
......@@ -292,7 +294,11 @@ class ParallelDataset(Dataset):
sentences_WITH_IDS = sentences
sentences = []
ids_ = []
# print(sentences_WITH_IDS)
# print("sep_index = ", self.sep_index)
#exit()
for s in sentences_WITH_IDS:
# print(np.where(s == self.sep_index))
pos = np.where(s == self.sep_index)[0][0]
sentences.append(s[pos + 1:])
ids_.append(s[:pos])
......@@ -332,6 +338,8 @@ class ParallelDataset(Dataset):
"""
eos = self.eos_index
# check number of sentences
# print("pos1 = ", self.pos1)
# print("pos2 = ", self.pos2)
assert len(self.pos1) == len(self.pos2) > 0
# check sentences indices
assert len(self.pos1) == (self.sent1[self.pos1[:, 1]] == eos).sum()
......@@ -418,6 +426,12 @@ class ParallelDataset(Dataset):
sentence_ids = sentence_ids[:self.max_batch_size]
pos1 = self.pos1[sentence_ids]
pos2 = self.pos2[sentence_ids]
#print(pos1)
#print(pos2)
#print(self.sent1)
#print(len(self.sent1))
#print(self.sent2)
sent1 = self.batch_sentences([self.sent1[a:b] for a, b in pos1])
sent2 = self.batch_sentences([self.sent2[a:b] for a, b in pos2])
yield (sent1, sent2, sentence_ids) if return_indices else (sent1, sent2)
......@@ -448,6 +462,8 @@ class ParallelDataset(Dataset):
if tokens_per_batch == -1:
batches = np.array_split(indices, math.ceil(
len(indices) * 1. / self.batch_size))
# print("tokens_per_batch, batches = " , batches)
# exit()
else:
batch_ids = np.cumsum(lengths[indices]) // tokens_per_batch
_, bounds = np.unique(batch_ids, return_index=True)
......@@ -455,6 +471,8 @@ class ParallelDataset(Dataset):
for i in range(len(bounds) - 1)]
if bounds[-1] < len(indices):
batches.append(indices[bounds[-1]:])
# print("tokens_per_batch, batches = " , batches)
# exit()
# optionally shuffle batches
if shuffle:
......
......@@ -172,11 +172,11 @@ class Dictionary(object):
"""
Index sentences with a dictionary.
"""
if bin_path is not None and os.path.isfile(bin_path):
print("Loading data from %s ..." % bin_path)
data = torch.load(bin_path)
assert dico == data['dico']
return data
# if bin_path is not None and os.path.isfile(bin_path):
# print("Loading data from %s ..." % bin_path)
# data = torch.load(bin_path)
# assert dico == data['dico']
# return data
positions = []
sentences = []
......@@ -231,4 +231,5 @@ class Dictionary(object):
print("Saving the data to %s ..." % bin_path)
torch.save(data, bin_path, pickle_protocol=4)
return data
......@@ -63,16 +63,22 @@ def load_binarized(path, params):
assert path.endswith('.pth')
if params.debug_train:
path = path.replace('train', 'valid')
if getattr(params, 'multi_gpu', False):
# print("## here ## {}".format(getattr(params, 'multi_gpu', False)))
if not getattr(params, 'multi_gpu', False):
# print("here")
assert params.split_data_accross_gpu in ['local', 'global']
# print(" ## here ## %s %i %i" % path, params.local_rank, params.global_rank )
if params.split_data_accross_gpu == 'local':
split_path = '%s.%i.pth' % (path[:-4], params.local_rank)
else:
split_path = '%s.%i.pth' % (path[:-4], params.global_rank)
print(" ## here ## %s %i %i" % (split_path, params.local_rank, params.global_rank) )
if os.path.isfile(split_path):
assert params.split_data is False
path = split_path
else :
print("warning : split path not available {}".format(split_path))
assert os.path.isfile(path), path
logger.info("Loading data from %s ..." % path)
data = torch.load(path)
......@@ -89,6 +95,7 @@ def set_dico_parameters(params, data, dico):
else:
data['dico'] = dico
n_words = len(dico)
print("n_words = ", n_words)
bos_index = dico.index(BOS_WORD)
eos_index = dico.index(EOS_WORD)
pad_index = dico.index(PAD_WORD)
......@@ -205,6 +212,11 @@ def load_para_data(params, data):
tgt_data = load_binarized(tgt_path, params)
# update dictionary parameters
# print("load_parallel_data")
# print(params)
# print(data)
# print(src_data['dico'])
# exit()
set_dico_parameters(params, data, src_data['dico'])
set_dico_parameters(params, data, tgt_data['dico'])
......@@ -310,6 +322,8 @@ def check_data_params(params):
if not os.path.isfile(p):
logger.error(f"{p} not found")
if not params.eval_only:
for paths in params.mono_dataset.values():
print("mono_path = ", paths)
assert all([all([os.path.isfile(p) or os.path.isfile(p.replace('pth', '0.pth'))
for p in paths.values()]) for paths in params.mono_dataset.values()])
......@@ -320,8 +334,10 @@ def check_data_params(params):
[(l2, l3) for _, l2, l3 in params.bt_steps])
params.para_dataset = {
(src, tgt): {
splt: (os.path.join(params.data_path, '%s.%s-%s.%s.pth' % (splt, src, tgt, src)),
os.path.join(params.data_path, '%s.%s-%s.%s.pth' % (splt, src, tgt, tgt)))
splt: (os.path.join(params.data_path, '%s.%s-%s.%s.pth' % (splt,src, tgt, src)),
os.path.join(params.data_path, '%s.%s-%s.%s.pth' % (splt,src, tgt, tgt)))
# splt: (os.path.join(params.data_path, '%s.%s.pth' % (splt, src)),
# os.path.join(params.data_path, '%s.%s.pth' % (splt, tgt)))
for splt in ['train', 'valid', 'test']
if splt != 'train' or (src, tgt) in required_para_train or (tgt, src) in required_para_train
} for src in params.langs for tgt in params.langs
......
......@@ -447,9 +447,9 @@ class EncDecEvaluator(Evaluator):
langs=langs1, causal=False)
enc1 = enc1.transpose(0, 1)
enc1 = enc1.half() if params.fp16 else enc1
if max(len2) > 1024:
print('remove one long sentence')
continue
# if max(len2) > 1024:
# logger.info('remove one long sentence')
# continue
# decode target sentence
dec2 = decoder('fwd', x=x2, lengths=len2, langs=langs2,
causal=True, src_enc=enc1, src_len=len1)
......@@ -480,8 +480,11 @@ class EncDecEvaluator(Evaluator):
lengths, _ = lengths.reshape(-1,
params.number_samples).max(dim=1)
else:
# logger.info("generation path 3")
generated, lengths = decoder.generate(
enc1, len1, lang2_id, max_len=len_v)
# test for
# exit()
# print(f'path 1: {generated.shape}')
else:
......@@ -493,7 +496,9 @@ class EncDecEvaluator(Evaluator):
max_len=len_v
)
# print(f'path 2: {generated.shape}')
hypothesis.extend(convert_to_text(
# exit()
logger.info("dumping hypothesis text")
hypothesis.extend(convert_to_text_for_hyp(
generated, lengths, self.dico, params, generate_several_reps=True))
# compute perplexity and prediction accuracy
......@@ -608,6 +613,48 @@ def convert_to_text(batch, lengths, dico, params, generate_several_reps=False):
else:
return [s[0] for s in sentences]
def convert_to_text_for_hyp(batch, lengths, dico, params, generate_several_reps=False):
"""
Convert a batch of sentences to a list of text sentences.
"""
batch = batch.cpu().numpy()
lengths = lengths.cpu().numpy()
assert len(batch.shape) == 2 or len(
batch.shape) == 3, f'generated batch shape was {batch.shape} while it should be in dimension 2 or 3'
nb_repetitions = 1
if len(batch.shape) == 2:
slen, bs = batch.shape
assert (batch[0] == params.eos_index).sum() == bs
assert (batch == params.eos_index).sum() == 2 * bs
else:
slen, nb_repetitions, bs = batch.shape
assert (batch == params.eos_index).sum() == 2 * bs * nb_repetitions
assert (batch[0] == params.eos_index).sum() == bs * nb_repetitions, print(
f"The values were {(batch[0] == params.eos_index).sum()} and {bs * nb_repetitions}")
assert lengths.max() == slen and lengths.shape[0] == bs, print(
lengths.max(), slen, lengths.shape[0], bs)
sentences = []
for j in range(bs):
sentences.append([])
for rep in range(nb_repetitions):
words = []
for k in range(1, lengths[j]):
next_element = batch[k, j] if len(
batch.shape) == 2 else batch[k, rep, j]
# print("element = ", next_element)
if next_element == params.eos_index:
# print("word unk ")
break
words.append(dico[next_element])
# print("words = ", dico[next_element])
sentences[j].append(" ".join(words))
if generate_several_reps:
return sentences
else:
return [s[0] for s in sentences]
def eval_moses_bleu(ref, hyp):
"""
......@@ -625,4 +672,5 @@ def eval_moses_bleu(ref, hyp):
return float(result[7:result.index(',')])
else:
logger.warning('Impossible to parse BLEU score! "%s"' % result)
return -1
return 0
# return -1
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# $Id$
use warnings;
use strict;
my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
$lowercase = 1;
shift;
}
my $stem = $ARGV[0];
if (!defined $stem) {
print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
print STDERR "Reads the references from reference or reference0, reference1, ...\n";
exit(1);
}
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
my @REF;
my $ref=0;
while(-e "$stem$ref") {
&add_to_ref("$stem$ref",\@REF);
$ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;
# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
&add_to_ref($stem,\@REF) if -e $stem;
}
sub add_to_ref {
my ($file,$REF) = @_;
my $s=0;
if ($file =~ /.gz$/) {
open(REF,"gzip -dc $file|") or die "Can't read $file";
} else {
open(REF,$file) or die "Can't read $file";
}
while(<REF>) {
chop;
push @{$$REF[$s++]}, $_;
}
close(REF);
}
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
chop;
$_ = lc if $lowercase;
my @WORD = split;
my %REF_NGRAM = ();
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
foreach my $reference (@{$REF[$s]}) {
# print "$s $_ <=> $reference\n";
$reference = lc($reference) if $lowercase;
my @WORD = split(' ',$reference);
my $length = scalar(@WORD);
my $diff = abs($length_translation_this_sentence-$length);
if ($diff < $closest_diff) {
$closest_diff = $diff;
$closest_length = $length;
# print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
} elsif ($diff == $closest_diff) {
$closest_length = $length if $length < $closest_length;
# from two references with the same closeness to me
# take the *shorter* into account, not the "first" one.
}
for(my $n=1;$n<=4;$n++) {
my %REF_NGRAM_N = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$REF_NGRAM_N{$ngram}++;
}
foreach my $ngram (keys %REF_NGRAM_N) {
if (!defined($REF_NGRAM{$ngram}) ||
$REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
$REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
}
}
}
}
$length_translation += $length_translation_this_sentence;
$length_reference += $closest_length;
for(my $n=1;$n<=4;$n++) {
my %T_NGRAM = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$T_NGRAM{$ngram}++;
}
foreach my $ngram (keys %T_NGRAM) {
$ngram =~ /^(\d+) /;
my $n = $1;
# my $corr = 0;
# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
$TOTAL[$n] += $T_NGRAM{$ngram};
if (defined($REF_NGRAM{$ngram})) {
if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
$CORRECT[$n] += $T_NGRAM{$ngram};
# $corr = $T_NGRAM{$ngram};
# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
}
else {
$CORRECT[$n] += $REF_NGRAM{$ngram};
# $corr = $REF_NGRAM{$ngram};
# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
}
}
# $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
# print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
}
}
$s++;
}
my $brevity_penalty = 1;
my $bleu = 0;
my @bleu=();
for(my $n=1;$n<=4;$n++) {
if (defined ($TOTAL[$n])){
$bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
# print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
}else{
$bleu[$n]=0;
}
}
if ($length_reference==0){
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
exit(1);
}
if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
my_log( $bleu[2] ) +
my_log( $bleu[3] ) +
my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
100*$bleu[1],
100*$bleu[1],
100*$bleu[2],
100*$bleu[3],
100*$bleu[4],
$brevity_penalty,
$length_translation / $length_reference,
$length_translation,
$length_reference;
# print STDERR "It is in-advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n";
sub my_log {
return -9999999999 unless $_[0];
return log($_[0]);
}
......@@ -79,6 +79,10 @@ def check_model_params(params):
else:
s = params.reload_model.split(',')
assert len(s) == 2
print(s)
for x in s:
print(x)
print(os.path.isfile(x))
assert all([x == '' or os.path.isfile(x) for x in s])
assert not (params.beam_size > 1 and params.number_samples >
......@@ -181,6 +185,8 @@ def build_model(params, dico):
enc_path, map_location=lambda storage, loc: storage.cuda(params.local_rank))
enc_reload = enc_reload['model' if 'model' in enc_reload else 'encoder']
if all([k.startswith('module.') for k in enc_reload.keys()]):
for k,v in enc_reload.items():
print("name = ", k)
enc_reload = {k[len('module.'):]: v for k,
v in enc_reload.items()}
......
......@@ -40,6 +40,9 @@ logger = getLogger()
def Embedding(num_embeddings, embedding_dim, padding_idx=None):
print("fuck,", num_embeddings)
print("fuck,", embedding_dim)
print("fuck,", padding_idx)
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
if padding_idx is not None:
......@@ -459,6 +462,11 @@ class TransformerModel(nn.Module):
- (lang_id1, lang_id2) if two languages are involved (MT)
"""
saved_args = locals()
# print("saved_args = ", saved_args)
# print("src_enc.shape: ", src_enc.shape)
# print("src_len.shape: ", src_len.shape)
if isinstance(max_len, int):
max_lengths = src_len.clone().fill_(max_len)
global_max_len = max_len
......@@ -472,6 +480,7 @@ class TransformerModel(nn.Module):
# generated sentences
generated = src_len.new(global_max_len, bs) # upcoming output
# print("generated = ", generated.shape)
generated.fill_(self.pad_index) # fill upcoming ouput with <PAD>
# we use <EOS> for <BOS> everywhere
generated[0].fill_(self.eos_index)
......@@ -480,19 +489,28 @@ class TransformerModel(nn.Module):
positions = src_len.new(global_max_len).long()
positions = torch.arange(global_max_len, out=positions).unsqueeze(
1).expand(global_max_len, bs)
# print("positions = ", positions)
# print("positions = ", positions.shape)
# language IDs
langs = src_len.new(global_max_len).long().fill_(tgt_lang_id)
langs = langs.unsqueeze(1).expand(global_max_len, bs)
# print("langs = ", langs)
# print("langs = ", langs.shape)
# current position / max lengths / length of generated sentences / unfinished sentences
cur_len = 1
gen_len = src_len.clone().fill_(1)
unfinished_sents = src_len.clone().fill_(1)
# print("gen_len = ", gen_len)
# print("unfinished_sents = ", unfinished_sents)
# cache compute states
self.cache = {'slen': 0}
previous_unfinished_mask = unfinished_sents.ne(0)
# print("previous_unfinished_mask = ", previous_unfinished_mask)
# logger.info("cur_len = ", cur_len)
# logger.info("global_max_len = ", global_max_len)
while cur_len < global_max_len:
# compute word scores
unfinished_mask = unfinished_sents.ne(0)
......@@ -521,7 +539,11 @@ class TransformerModel(nn.Module):
assert tensor.size() == (1, unfinished_mask.sum().item(), self.dim), (cur_len,
global_max_len, src_enc.size(), tensor.size(), (1, bs, self.dim))
tensor = tensor.data[-1, :, :].type_as(src_enc) # (bs, dim)
# print("tensor = ", tensor)
# print("tensor = ", tensor.shape)
scores = self.pred_layer.get_scores(tensor) # (bs, n_words)
# print("scores = ", scores)
# print("scores = ", scores.shape)
# select next words: sample or greedy
if sample_temperature is None:
......@@ -530,6 +552,8 @@ class TransformerModel(nn.Module):
next_words = torch.multinomial(
F.softmax(scores.float() / sample_temperature, dim=1), 1).squeeze(1)
assert next_words.size() == (unfinished_mask.sum().item(),)
# print("next_words = ", next_words)
# print("next_words = ", next_words.shape)
# update generations / lengths / finished sentences / current length.
# No need to updates the finished sequences since the value is self.pad_index by default
......@@ -546,6 +570,7 @@ class TransformerModel(nn.Module):
previous_unfinished_mask = unfinished_mask
# stop when there is a </s> in each sentence, or if we exceed the maximal length
if unfinished_sents.max() == 0:
logger.info("fuking break")
break
# sanity check
......
......@@ -54,7 +54,8 @@ def init_distributed_mode(params):
- global_rank
- world_size
"""
params.is_slurm_job = 'SLURM_JOB_ID' in os.environ and not params.debug_slurm
# params.is_slurm_job = 'SLURM_JOB_ID' in os.environ and not params.debug_slurm
params.is_slurm_job = False
print("SLURM job: %s" % str(params.is_slurm_job))
# SLURM job
......
......@@ -359,7 +359,7 @@ class Trainer(object):
x2 = x.clone()
for i in range(l.size(0)):
# generate a random permutation
scores = np.arange(l[i] - 1) + noise[:l[i] - 1, i]
scores = np.arange(int(l[i] - 1)) + noise[:l[i] - 1, i]
permutation = scores.argsort()
# shuffle words
x2[:l[i] - 1, i].copy_(x2[:l[i] - 1, i]
......
......@@ -482,6 +482,7 @@ def set_sampling_probs(data, params):
# monolingual data
params.mono_list = [
k for k, v in data['mono_stream'].items() if 'train' in v]
print("params.mono_list = ", params.mono_list)
if len(params.mono_list) > 0:
probs = np.array([1.0 * len(data['mono_stream'][lang]['train'])
for lang in params.mono_list])
......@@ -492,6 +493,7 @@ def set_sampling_probs(data, params):
# parallel data
params.para_list = [k for k, v in data['para'].items() if 'train' in v]
print("params.para_list = ", params.para_list)
if len(params.para_list) > 0:
probs = np.array([1.0 * len(data['para'][(l1, l2)]['train'])
for (l1, l2) in params.para_list])
......
fastBPE @ 036711f8
Subproject commit 036711f8fdc3265d64e8e123a0761be12c5a8e74
......@@ -101,7 +101,7 @@ def get_parser():
help="Maximum vocabulary size (-1 to disable)")
parser.add_argument("--min_count", type=int, default=0,
help="Minimum vocabulary count")
parser.add_argument("--lg_sampling_factor", type=float, default=-1,
parser.add_argument("--lg_sampling_factor", type=float, default=0.1,
help="Language sampling factor")
parser.add_argument("--has_sentences_ids", type=bool_flag, default=False,
help="Parallel sentences has an id or not in parallel datasets.")
......
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# $Id$
use warnings;
use strict;
my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
$lowercase = 1;
shift;
}
my $stem = $ARGV[0];
if (!defined $stem) {
print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
print STDERR "Reads the references from reference or reference0, reference1, ...\n";
exit(1);
}
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
my @REF;
my $ref=0;
while(-e "$stem$ref") {
&add_to_ref("$stem$ref",\@REF);
$ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;
# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
&add_to_ref($stem,\@REF) if -e $stem;
}
sub add_to_ref {
my ($file,$REF) = @_;
my $s=0;
if ($file =~ /.gz$/) {
open(REF,"gzip -dc $file|") or die "Can't read $file";
} else {
open(REF,$file) or die "Can't read $file";
}
while(<REF>) {
chop;
push @{$$REF[$s++]}, $_;
}
close(REF);
}
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
chop;
$_ = lc if $lowercase;
my @WORD = split;
my %REF_NGRAM = ();
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
foreach my $reference (@{$REF[$s]}) {
# print "$s $_ <=> $reference\n";
$reference = lc($reference) if $lowercase;
my @WORD = split(' ',$reference);
my $length = scalar(@WORD);
my $diff = abs($length_translation_this_sentence-$length);
if ($diff < $closest_diff) {
$closest_diff = $diff;
$closest_length = $length;
# print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
} elsif ($diff == $closest_diff) {
$closest_length = $length if $length < $closest_length;
# from two references with the same closeness to me
# take the *shorter* into account, not the "first" one.
}
for(my $n=1;$n<=4;$n++) {
my %REF_NGRAM_N = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$REF_NGRAM_N{$ngram}++;
}
foreach my $ngram (keys %REF_NGRAM_N) {
if (!defined($REF_NGRAM{$ngram}) ||
$REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
$REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
}
}
}
}
$length_translation += $length_translation_this_sentence;
$length_reference += $closest_length;
for(my $n=1;$n<=4;$n++) {
my %T_NGRAM = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$T_NGRAM{$ngram}++;
}
foreach my $ngram (keys %T_NGRAM) {
$ngram =~ /^(\d+) /;
my $n = $1;
# my $corr = 0;
# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
$TOTAL[$n] += $T_NGRAM{$ngram};
if (defined($REF_NGRAM{$ngram})) {
if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
$CORRECT[$n] += $T_NGRAM{$ngram};
# $corr = $T_NGRAM{$ngram};
# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
}
else {
$CORRECT[$n] += $REF_NGRAM{$ngram};
# $corr = $REF_NGRAM{$ngram};
# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
}
}
# $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
# print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
}
}
$s++;
}
my $brevity_penalty = 1;
my $bleu = 0;
my @bleu=();
for(my $n=1;$n<=4;$n++) {
if (defined ($TOTAL[$n])){
$bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
# print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
}else{
$bleu[$n]=0;
}
}
if ($length_reference==0){
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
exit(1);
}
if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
my_log( $bleu[2] ) +
my_log( $bleu[3] ) +
my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
100*$bleu,
100*$bleu[1],
100*$bleu[2],
100*$bleu[3],
100*$bleu[4],
$brevity_penalty,
$length_translation / $length_reference,
$length_translation,
$length_reference;
# print STDERR "It is in-advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n";
sub my_log {
return -9999999999 unless $_[0];
return log($_[0]);
}
from matplotlib import pyplot as plt
import json
from math import log
def data_readin():
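# Collect the JSON payload after each '__log__:' marker in ./pretrain_log and ./train_log
# into {metric_name: [value per epoch]} dicts.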
lis=None
with open('./pretrain_log', 'r') as f:
for line in f.readlines():
i=json.loads(line.split('__log__:')[1])
if not lis:
lis={}
for k in i.keys():
lis[k]=[]
for k in lis.keys():
lis[k].append(i[k])
pretrain=lis.copy()
lis=None
with open('./train_log', 'r') as f:
for line in f.readlines():
i=json.loads(line.split('__log__:')[1])
if not lis:
lis={}
for k in i.keys():
lis[k]=[]
for k in lis.keys():
lis[k].append(i[k])
return pretrain, lis
# 'epoch', 'valid_cpp_mlm_ppl', 'valid_cpp_mlm_acc', 'valid_cuda_mlm_ppl', 'valid_cuda_mlm_acc', 'valid_mlm_ppl', 'valid_mlm_acc', 'test_cpp_mlm_ppl', 'test_cpp_mlm_acc', 'test_cuda_mlm_ppl', 'test_cuda_mlm_acc', 'test_mlm_ppl', 'test_mlm_acc'
# 'epoch', 'valid_cpp_sa-cuda_sa_mt_ppl', 'valid_cpp_sa-cuda_sa_mt_acc', 'valid_cpp_sa-cuda_sa_mt_bleu', 'test_cpp_sa-cuda_sa_mt_ppl', 'test_cpp_sa-cuda_sa_mt_acc', 'test_cpp_sa-cuda_sa_mt_bleu'
if __name__ == "__main__":
pretrain, train =data_readin()
fig=plt.figure(figsize=(10, 5))
plt.autoscale()
ax1=fig.add_subplot(1, 1, 1)
ax2=ax1.twinx()
ind1=0
ind2=0
for k in pretrain.keys():
# print(k)
if k=='epoch':
continue
y=pretrain[k]
if 'ppl' in k:
y=[10*log(i) for i in y]
if 'ppl' in k:
ax1.plot(y, marker='o', color='C%d'%ind1, label=k)
ind1+=1
else:
ax2.plot(y, marker='*', color='C%d'%ind2, linestyle='-.', label=k)
ind2+=1
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.savefig('./pic/tmp.png')
\ No newline at end of file
......@@ -20,29 +20,52 @@ def check_files_and_symlink_for_XLM(dataset, langs):
print("check that all files exist...")
suffixs = {"": "", ".functions_standalone": "_sa"}
for lang in langs:
for cat in ["", ".functions_standalone"]:
for cat in [".functions_standalone"]:
for i in range(8):
assert dataset.folder.joinpath(
f"{lang}.train{dataset.suffix}.{i}{cat}.bpe.pth").is_file()
assert dataset.folder.joinpath(
f"{lang}.test{dataset.suffix}{cat}.bpe.pth").is_file()
f"{lang}.train{dataset.suffix}{cat}.bpe.pth").is_file(), "{0} file error".format(f"{lang}.train{dataset.suffix}{cat}.bpe.pth")
assert dataset.folder.joinpath(
f"{lang}.test{dataset.suffix}{cat}.bpe.pth").is_file(), "{0} file error".format(f"{lang}.test{dataset.suffix}{cat}.bpe.pth")
assert dataset.folder.joinpath(
f"{lang}.valid{dataset.suffix}{cat}.bpe.pth").is_file()
XLM_folder = Path(str(dataset.folder)+'.XLM-syml')
XLM_folder.mkdir(exist_ok=True)
print("create symlinks for XLM ...")
for lang in langs:
for cat in ["", ".functions_standalone"]:
for cat in [".functions_standalone"]:
for i in range(8):
create_symlink(dataset.folder.joinpath(f"{lang}.train{dataset.suffix}.{i}{cat}.bpe.pth"),
XLM_folder.joinpath(f"train.{lang}{suffixs[cat]}.{i}.pth"))
create_symlink(dataset.folder.joinpath(f"{lang}.train{dataset.suffix}{cat}.bpe.pth"),
XLM_folder.joinpath(f"train.{lang}{suffixs[cat]}.pth"))
create_symlink(dataset.folder.joinpath(f"{lang}.test{dataset.suffix}{cat}.bpe.pth"),
XLM_folder.joinpath(f"test.{lang}{suffixs[cat]}.pth"))
create_symlink(dataset.folder.joinpath(f"{lang}.valid{dataset.suffix}{cat}.bpe.pth"),
XLM_folder.joinpath(f"valid.{lang}{suffixs[cat]}.pth"))
def preprocess(root, lang1, lang2, keep_comments, local, lang3=None, test_size=1000, ncodes=100000, size_gb=50):
subprocess.run(f"cp {XLM_folder.joinpath(f'train.{lang}_sa.pth')} {XLM_folder.joinpath(f'train.{lang}.pth')}", shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
subprocess.run(f"cp {XLM_folder.joinpath(f'test.{lang}_sa.pth')} {XLM_folder.joinpath(f'test.{lang}.pth')}", shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
subprocess.run(f"cp {XLM_folder.joinpath(f'valid.{lang}_sa.pth')} {XLM_folder.joinpath(f'valid.{lang}.pth')}", shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
create_symlink(dataset.folder.joinpath(f"cuda.train.cpp_sa-cuda_sa.cuda_sa.bpe.pth"),
XLM_folder.joinpath(f"train.cpp_sa-cuda_sa.cuda_sa.pth"))
create_symlink(dataset.folder.joinpath(f"cuda.test.cpp_sa-cuda_sa.cuda_sa.bpe.pth"),
XLM_folder.joinpath(f"test.cpp_sa-cuda_sa.cuda_sa.pth"))
create_symlink(dataset.folder.joinpath(f"cuda.valid.cpp_sa-cuda_sa.cuda_sa.bpe.pth"),
XLM_folder.joinpath(f"valid.cpp_sa-cuda_sa.cuda_sa.pth"))
create_symlink(dataset.folder.joinpath(f"cpp.train.cpp_sa-cuda_sa.cpp_sa.bpe.pth"),
XLM_folder.joinpath(f"train.cpp_sa-cuda_sa.cpp_sa.pth"))
create_symlink(dataset.folder.joinpath(f"cpp.test.cpp_sa-cuda_sa.cpp_sa.bpe.pth"),
XLM_folder.joinpath(f"test.cpp_sa-cuda_sa.cpp_sa.pth"))
create_symlink(dataset.folder.joinpath(f"cpp.valid.cpp_sa-cuda_sa.cpp_sa.bpe.pth"),
XLM_folder.joinpath(f"valid.cpp_sa-cuda_sa.cpp_sa.pth"))
def preprocess(root, lang1, lang2, keep_comments, local, lang3=None, test_size=1000, ncodes=10000, size_gb=50):
if size_gb < 1:
size_gb = None
dataset = Dataset(root, lang1, lang2, keep_comments,
......@@ -66,15 +89,19 @@ def preprocess(root, lang1, lang2, keep_comments, local, lang3=None, test_size=1
dataset.train_bpe(ncodes=ncodes, size_gb=size_gb)
dataset.apply_bpe(
f'train{dataset.suffix}.[01234567].tok', use_vocab=False, executor=cluster_ex2)
dataset.apply_bpe(f'test{dataset.suffix}.tok',
dataset.apply_bpe(
f'train{dataset.suffix}.tok', use_vocab=False, executor=cluster_ex2)
dataset.apply_bpe(f'test{dataset.suffix}.functions_standalone.tok',
use_vocab=False, executor=None)
dataset.apply_bpe(f'valid{dataset.suffix}.tok',
dataset.apply_bpe(f'valid{dataset.suffix}.functions_standalone.tok',
use_vocab=False, executor=None)
dataset.get_vocab(size_gb=size_gb)
dataset.binarize_for_XLM(
f'train{dataset.suffix}.[0123456789].bpe', executor=cluster_ex2)
dataset.binarize_for_XLM(
f'train{dataset.suffix}.bpe', executor=cluster_ex2)
dataset.binarize_for_XLM(f'test{dataset.suffix}.bpe', executor=None)
dataset.binarize_for_XLM(f'valid{dataset.suffix}.bpe', executor=None)
......@@ -86,10 +113,16 @@ def preprocess(root, lang1, lang2, keep_comments, local, lang3=None, test_size=1
f'train{dataset.suffix}.[0123456789].functions_standalone.bpe', executor=cluster_ex2)
dataset.binarize_for_XLM(
f'train{dataset.suffix}.functions_standalone.bpe', executor=cluster_ex2)
dataset.binarize_for_XLM(
f'test{dataset.suffix}.functions_*.bpe', executor=None)
dataset.binarize_for_XLM(
f'valid{dataset.suffix}.functions_*.bpe', executor=None)
dataset.binarize_for_XLM(
f'*cpp_sa-cuda_sa*.bpe', executor=None)
langs = [lang1, lang2] if lang3 is None else [lang1, lang2, lang3]
check_files_and_symlink_for_XLM(dataset, langs)
......
......@@ -21,7 +21,7 @@ from preprocessing.src.timeout import timeout, TimeoutError
from sacrebleu import tokenize_v14_international
TOK_NO_SPACE_BEFORE = {',', ';'}
clang.cindex.Config.set_library_path('/usr/lib/llvm-7/lib/')
clang.cindex.Config.set_library_file('/tools/cluster-software/llvm/llvm-7.0.0/lib/libclang.so')
STRINGS_AND_COMMENTS_TOKEN_KINDS = {TokenKind.LITERAL, TokenKind.COMMENT}
logging.basicConfig(
filename='timeout_cpp_tokenizer_examples.log', level=logging.DEBUG)
......@@ -48,6 +48,10 @@ JAVA_CHAR2TOKEN = {"//": ' STOKEN0 ',
CPP_TOKEN2CHAR = JAVA_TOKEN2CHAR.copy()
CPP_CHAR2TOKEN = JAVA_CHAR2TOKEN.copy()
CUDA_TOKEN2CHAR = JAVA_TOKEN2CHAR.copy()
CUDA_CHAR2TOKEN = JAVA_CHAR2TOKEN.copy()
PYTHON_TOKEN2CHAR = {'STOKEN0': '#',
'STOKEN1': "\\n",
'STOKEN2': '"""',
......@@ -329,6 +333,36 @@ def get_cpp_tokens_and_types(s):
tokens.append((tok.spelling, tok.kind))
return tokens
def tokenize_cuda(s, keep_comments=False):
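# CUDA sources reuse the clang-based C++ tokenizer; comments and string literals are
# normalized with the CPP token maps.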
tokens = []
assert isinstance(s, str)
try:
tokens_and_types = get_cpp_tokens_and_types(s)
for tok, typ in tokens_and_types:
if not keep_comments and typ == TokenKind.COMMENT:
continue
if typ in STRINGS_AND_COMMENTS_TOKEN_KINDS:
if typ == TokenKind.COMMENT:
com = process_string(
tok, CPP_CHAR2TOKEN, CPP_TOKEN2CHAR, True)
if len(com) > 0:
tokens.append(com)
else:
tokens.append(process_string(
tok, CPP_CHAR2TOKEN, CPP_TOKEN2CHAR, False))
else:
tokens.append(tok)
return tokens
except KeyboardInterrupt:
raise
except TimeoutError:
print(f'TimeOut Error')
logging.info('*' * 20)
logging.info(f'TimeOut Error for string {s}')
return []
except:
return []
def tokenize_cpp(s, keep_comments=False):
tokens = []
......@@ -384,6 +418,60 @@ def tokenize_java(s, keep_comments=False):
return []
def detokenize_cuda(s):
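# Mirrors detokenize_cpp: re-tokenize with clang, restore brace/comment placeholders
# (CB_/OB_, SPACETOKEN, ENDCOM), then re-indent the lines.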
assert isinstance(s, str) or isinstance(s, list)
if isinstance(s, list):
s = ' '.join(s)
# the ▁ character created bugs in the cpp tokenizer
s = s.replace('ENDCOM', '\n').replace('▁', ' SPACETOKEN ')
try:
tokens_and_types = get_cpp_tokens_and_types(s)
except:
return ''
new_tokens = []
i = 0
while i < len(tokens_and_types):
token, type = tokens_and_types[i]
if type in STRINGS_AND_COMMENTS_TOKEN_KINDS:
new_tokens.append(token.replace('STRNEWLINE', '\n').replace(
'TABSYMBOL', '\t').replace(' ', '').replace('SPACETOKEN', ' '))
if type == TokenKind.COMMENT:
new_tokens.append('NEW_LINE')
elif token == '}':
if i < len(tokens_and_types) - 1 and tokens_and_types[i + 1][0] == ';':
new_tokens += ['CB_COLON', 'NEW_LINE']
i += 2
continue
if i < len(tokens_and_types) - 1 and tokens_and_types[i + 1][0] == ',':
new_tokens += ['CB_COMA', 'NEW_LINE']
i += 2
continue
new_tokens += ['CB_', 'NEW_LINE']
elif token == '{':
new_tokens += ['OB_', 'NEW_LINE']
elif token == '*/':
new_tokens += ['*/', 'NEW_LINE']
elif token == ';':
new_tokens += [';', 'NEW_LINE']
else:
new_tokens.append(token)
if i < len(tokens_and_types) - 1 and tokens_and_types[i + 1][0] in TOK_NO_SPACE_BEFORE:
next_token = tokens_and_types[i + 1][0]
new_tokens[len(new_tokens) - 1] += next_token
if next_token == ';':
new_tokens.append('NEW_LINE')
i += 2
continue
i += 1
lines = re.split('NEW_LINE', ' '.join(new_tokens))
untok_s = indent_lines(lines)
untok_s = untok_s.replace('CB_COLON', '};').replace(
'CB_COMA', '},').replace('CB_', '}').replace('OB_', '{')
return untok_s
def detokenize_cpp(s):
assert isinstance(s, str) or isinstance(s, list)
if isinstance(s, list):
......@@ -555,6 +643,8 @@ def extract_functions_java(s):
except StopIteration:
break
if 'static' in function[0:function.index('{')]:
print(function)
exit()
functions_standalone.append(
remove_java_annotation(' '.join(function)))
else:
......@@ -610,6 +700,93 @@ def clean_hashtags_functions_cpp(function):
function = function.strip()
return function
def extract_functions_cuda(s):
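# Follows extract_functions_cpp: walk the clang token stream and split functions into
# standalone functions vs. class members.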
try:
s = clean_hashtags_functions_cpp(s)
s = s.replace('ENDCOM', '\n').replace('▁', 'SPACETOKEN')
tokens = get_cpp_tokens_and_types(s)
except:
return [], []
i = ind_iter(len(tokens))
functions_standalone = []
functions_class = []
try:
token, token_type = tokens[i.i]
except:
return [], []
while True:
try:
# detect function
if token == ')' and ((tokens[i.i + 1][0] == '{' and tokens[i.i + 2][0] != '}') or (
tokens[i.i + 1][0] == 'throw' and tokens[i.i + 4][0] == '{' and tokens[i.i + 5][0] == '}')):
# go previous until the start of function
while token not in {';', '}', '{'}:
try:
i.prev()
except StopIteration:
break
token = tokens[i.i][0]
i.next()
token, token_type = tokens[i.i]
if token_type == TokenKind.COMMENT:
token = token.strip()
token += " ENDCOM"
function = [token]
token_types = [token_type]
while token != '{':
i.next()
token, token_type = tokens[i.i]
if token_type == TokenKind.COMMENT:
token = token.strip()
token += " ENDCOM"
function.append(token)
token_types.append(token_type)
if token_types[function.index('(') - 1] != TokenKind.IDENTIFIER:
continue
if token == '{':
number_indent = 1
while not (token == '}' and number_indent == 0):
try:
i.next()
token, token_type = tokens[i.i]
if token == '{':
number_indent += 1
elif token == '}':
number_indent -= 1
if token_type == TokenKind.COMMENT:
token = token.strip()
token += " ENDCOM"
function.append(token)
except StopIteration:
break
if 'static' in function[0:function.index('{')] or '::' not in function[0:function.index('(')]:
function = ' '.join(function)
function = re.sub(
"[<][ ][D][O][C][U][M][E][N][T].*?[>] ", "", function)
function = clean_hashtags_functions_cpp(function)
function = function.strip()
function = function.replace(
'\n', 'ENDCOM').replace('SPACETOKEN', '▁')
if not re.sub('[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]', "", function[:function.index('{')]).strip().startswith('{') and not function.startswith('#'):
functions_standalone.append(function)
else:
function = ' '.join(function)
function = re.sub(
"[<][ ][D][O][C][U][M][E][N][T].*?[>] ", "", function)
function = clean_hashtags_functions_cpp(function)
function = function.strip()
function = function.replace(
'\n', 'ENDCOM').replace('SPACETOKEN', '▁')
if not re.sub('[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]', "", function[:function.index('{')]).strip().startswith('{') and not function.startswith('#'):
functions_class.append(function)
i.next()
token = tokens[i.i][0]
except:
break
return functions_standalone, functions_class
def extract_functions_cpp(s):
try:
......@@ -718,6 +895,26 @@ def extract_functions_cpp_with_docstring(function):
else:
return '', ''
def extract_functions_cuda_with_docstring(function):
function = re.sub("[<][ ][D][O][C][U][M][E][N][T].*?[>] ", "", function)
ds = re.findall('[/][*].*?[*][/][ ]', function, re.DOTALL)
if len(ds) > 0:
for d in ds:
function = function.replace(d, '')
coms = ' '.join([d[:-1] for d in ds])
inline_coms = re.findall('[/][/].*?[E][N][D][C][O][M]', function)
for inline_com in inline_coms:
function = function.replace(inline_com, '')
coms += ' <INLINE> '
coms += inline_com
if len(re.sub(r'\W', '', coms.replace('<INLINE>', '').replace('ENDCOM', ''))) < 5:
return '', ''
else:
return re.sub('\s+', ' ', function), coms
else:
return '', ''
def remove_java_annotation(function):
return re.sub('^(@ (Override|Deprecated|SuppressWarnings) (\( .* \) )?)*', '', function)
......@@ -737,6 +934,9 @@ def get_function_name_java(s):
def get_function_name_cpp(s):
return get_first_token_before_first_parenthesis(s)
def get_function_name_cuda(s):
return get_first_token_before_first_parenthesis(s)
def extract_arguments_java(f):
return extract_arguments_java_using_parentheses(f)
......@@ -745,6 +945,9 @@ def extract_arguments_java(f):
def extract_arguments_cpp(f):
return extract_arguments_java_using_parentheses(f)
def extract_arguments_cuda(f):
return extract_arguments_java_using_parentheses(f)
def extract_arguments_java_using_parentheses(f):
f = f.split(' ')
......
......@@ -52,7 +52,7 @@ class Language:
n_lines = get_nlines(all_tok)
# shuf
shuf_file(all_tok)
# shuf_file(all_tok)
# select test/valid/train and split train in 8
subprocess.run(f"cat {all_tok} | head -n {test_size} > {self.folder.joinpath(f'valid{suffix}.tok')}",
......@@ -60,6 +60,12 @@ class Language:
subprocess.run(f"cat {all_tok} | head -n {2 * test_size} | tail -n {test_size} > {self.folder.joinpath(f'test{suffix}.tok')}", shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
split_len = int((n_lines - 2 * test_size) / 8)
train_len = int((n_lines - 2 * test_size))
subprocess.run(f"cat {all_tok} | tail -n {train_len} > {self.folder.joinpath(f'train{suffix}.tok')}", shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
for n, i in zip(range(8), range(2 * test_size, n_lines, split_len)):
subprocess.run(f"cat {all_tok} | head -n {i + split_len} | tail -n {split_len} > {self.folder.joinpath(f'train{suffix}.{n}.tok')}", shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
......@@ -94,6 +100,7 @@ class Language:
executor = LocalExecutor()
suffix = '.with_comments' if keep_comments else ''
files = list(self.folder.glob(f'train{suffix}.[01234567].tok'))
files.append(self.folder.joinpath(f'train{suffix}.tok'))
files.append(self.folder.joinpath(f'test{suffix}.tok'))
files.append(self.folder.joinpath(f'valid{suffix}.tok'))
toks = [tok for tok in files if not (tok.with_suffix('.functions_standalone.tok').is_file(
......@@ -112,6 +119,9 @@ class Language:
f'train{suffix}.[01234567].functions_class.tok'))
files += list(self.folder.glob(
f'train{suffix}.[01234567].functions_standalone.tok'))
files.append(self.folder.joinpath(f'train{suffix}.functions_class.tok'))
files.append(self.folder.joinpath(
f'train{suffix}.functions_standalone.tok'))
files.append(self.folder.joinpath(f'test{suffix}.functions_class.tok'))
files.append(self.folder.joinpath(
f'test{suffix}.functions_standalone.tok'))
......@@ -178,6 +188,7 @@ class Dataset:
size_gb_ = size_gb / len(self.langs)
nlines = [int(self.sizes[l.l][0] * size_gb_ * 1024 **
3 / self.sizes[l.l][1]) for l in self.langs]
print(nlines)
print(
f"we need to regroup {nlines} lines for {self.langs[0].l} {self.langs[1].l} and {self.langs[2].l} to gather {size_gb} Go")
# train bpe on only 50 GB (25 each lang) of the tokenized train set
......@@ -259,15 +270,31 @@ class Dataset:
job.result()
for split in ['test', 'valid']:
for f_type in ['functions_standalone', 'functions_class']:
for f_type in ['functions_standalone']:
truncate_files(l.folder.joinpath(
f'{split}{self.suffix}.{f_type}.tok') for l in self.langs)
print("apply bpe on train ... ")
self.apply_bpe(
f'train{self.suffix}.[01234567].functions_*.tok', use_vocab=False, executor=bpe_executor)
f'train{self.suffix}.[01234567].functions_standalone.tok', use_vocab=False, executor=bpe_executor)
self.apply_bpe(
f'train{self.suffix}.functions_standalone.tok', use_vocab=False, executor=bpe_executor)
print("apply bpe on test and valid ...")
self.apply_bpe(f'test{self.suffix}.functions_*.tok',
self.apply_bpe(f'test{self.suffix}.functions_standalone.tok',
use_vocab=False, executor=bpe_executor)
self.apply_bpe(f'valid{self.suffix}.functions_standalone.tok',
use_vocab=False, executor=bpe_executor)
print("apply bpe on paralle data...")
self.apply_bpe(f'train.cpp_sa-cuda_sa.cpp_sa.tok',
use_vocab=False, executor=bpe_executor)
self.apply_bpe(f'train.cpp_sa-cuda_sa.cuda_sa.tok',
use_vocab=False, executor=bpe_executor)
self.apply_bpe(f'test.cpp_sa-cuda_sa.cpp_sa.tok',
use_vocab=False, executor=bpe_executor)
self.apply_bpe(f'test.cpp_sa-cuda_sa.cuda_sa.tok',
use_vocab=False, executor=bpe_executor)
self.apply_bpe(f'valid.cpp_sa-cuda_sa.cpp_sa.tok',
use_vocab=False, executor=bpe_executor)
self.apply_bpe(f'valid{self.suffix}.functions_*.tok',
self.apply_bpe(f'valid.cpp_sa-cuda_sa.cuda_sa.tok',
use_vocab=False, executor=bpe_executor)
......@@ -44,7 +44,7 @@ def tokenize_json_helper(inpt):
@timeout(3600)
def output_all_tokenized_results(docs, f_tok):
pool = Pool(cpu_count())
pool = Pool(1)
result_content_tokenized = tqdm.tqdm(pool.imap_unordered(
tokenize_json_helper, docs), total=len(docs))
for content_tokenized, path in result_content_tokenized:
......@@ -52,7 +52,7 @@ def output_all_tokenized_results(docs, f_tok):
continue
else:
content_tokenized = ' '.join(content_tokenized)
s = f"<DOCUMENT_ID=\"{path}\"> {content_tokenized} </DOCUMENT>"
s = f"<DOCUMENT_ID=\"{path}\"> {content_tokenized} </DOCUMENT>"
# for some reason, sometimes some characters of s
# cannot be encoded into utf-8 and it failed to print, so use try/catch
try:
......@@ -96,6 +96,22 @@ def extract_functions_file(input_path, language, test_size=None):
extract_auto_code = getattr(
code_tokenizer, f"extract_functions_{language}")
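# Also derive a parallel-corpus file name (<split>.cpp_sa-cuda_sa.<lang>_sa.tok) next to the
# input, so standalone functions can be written out with a split tag.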
index = str(input_path).rfind('/')
suffix = str(input_path)[index:]
if "train" in suffix:
name = "train"
paral_name = "train.cpp_sa-cuda_sa.{}_sa.tok".format(language)
if "test" in suffix:
name = "test"
paral_name = "test.cpp_sa-cuda_sa.{}_sa.tok".format(language)
if "valid" in suffix:
name = "valid"
paral_name = "valid.cpp_sa-cuda_sa.{}_sa.tok".format(language)
paral_name = str(input_path)[:index+1] + paral_name
print("fuck_paral_name = ", paral_name)
f_paral_sa = open(paral_name, 'w', encoding='utf-8')
with output_path_sa.open('w', encoding='utf-8') as f_sa:
with output_path_class.open('w', encoding='utf-8') as f_class:
pool = Pool(cpu_count())
......@@ -104,11 +120,15 @@ def extract_functions_file(input_path, language, test_size=None):
extract_auto_code, lines), total=len(lines))
for func_standalone, func_class in result_functions:
for func in func_standalone:
f_paral_sa.write(name + "@@@@ | ")
f_paral_sa.write(func)
f_paral_sa.write('\n')
f_sa.write(func)
f_sa.write('\n')
for func in func_class:
f_class.write(func)
f_class.write('\n')
f_paral_sa.close()
def get_nlines(file_path):
......@@ -162,6 +182,7 @@ def shuf_file(file_path):
def apply_bpe_file(file_path, output, codes, vocab=None):
if vocab is None:
vocab = ''
print("comand = " + f"{FAST} applybpe {output} {file_path} {codes} {vocab}")
process = subprocess.run(f"{FAST} applybpe {output} {file_path} {codes} {vocab}",
shell=True,
stdout=subprocess.PIPE,
......@@ -195,6 +216,7 @@ def get_vocab_file(file_path, vocab):
def binarize_for_XLM_file(file_path, vocab):
print("binary_command = " + f"python {XLM_PP} {vocab} {file_path}")
process = subprocess.run(f"python {XLM_PP} {vocab} {file_path}",
shell=True,
stdout=subprocess.PIPE,
......@@ -234,7 +256,9 @@ def regroup_and_select_data(files, output, nlines=None):
def create_symlink(file_path, symlink):
assert file_path.is_file()
# print(file_path)
assert file_path.is_file(), "{0}, error".format(file_path)
#print(symlink)
assert not symlink.is_file()
process = subprocess.run(f"ln -s {file_path} {symlink}",
shell=True,
......
binary_command = python /lustre/S/wenyuanbo/Workspace/github/TransCoder/XLM/preprocess.py /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/vocab /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/cpp.valid.with_comments.functions_standalone.bpe
/lustre/S/wenyuanbo/Workspace/github/TransCoder/XLM/tools/fastBPE/fast applybpe /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/cuda.train.with_comments.6.bpe /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cuda/train.with_comments.6.tok /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/codes
rm -rf cpp/all.with_comments.tok
rm -rf cpp/test*
rm -rf cpp/train*
rm -rf cpp/valid*
rm -rf cuda/all.with_comments.tok
rm -rf cuda/test*
rm -rf cuda/train*
rm -rf cuda/valid*
rm -rf cpp-cuda-.with_comments/
rm -rf cpp/cpp.000.with_comments.tok
rm -rf cuda/cuda.000.with_comments.tok
rm -rf cpp-cuda-.with_comments.XLM-syml/
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
# SBATCH -J test # The job name
# SBATCH -o ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
# SBATCH -e ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu,nv-gpu-hw # Submit to 'nv-gpu' and 'nv-gpu-hw' Partitiion
#SBATCH -t 0-12:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --nprocs=8
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
###
### The system will alloc 8 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
# SBATCH --qos=gpu-short # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
which python3
which python
# ##- tools
# module load cluster-tools/v1.0
# module load cmake/3.15.7
# module load git/2.17.1
# module load vim/8.1.2424
# ##- language
# module load python3/3.6.8
# module load llvm/9.0.1
# module load gcc/9.3.0
# ##- cuda
# module load cuda-cudnn/11.0-8.0.4
##- virtualenv
# source xxxxx/activate
# conda init
# conda activate
#- Log information
echo $(module list) # list modules loaded
# echo $(which gcc)
echo $(which python)
echo $(which python3)
# echo $(conda list)
# echo $(pip list)
# cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
echo "Use GPU ${CUDA_VISIBLE_DEVICES}$" # which gpus
#- Warning! Please not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
#- Job step
# python -m preprocessing.preprocess /lustre/S/xushangqing/TransCoder/data/train_dataset/ --lang1 cpp --lang2 cu --keep_comments False --bpe_train_size 0 --test_size 1000 --local True
# export NGPU=1; python XLM/train.py \
# --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 \
# --word_blank '0.1' --n_layers 6 --save_periodic 1 \
# --dump_path '/lustre/S/xushangqing/TransCoder/trained/' \
# --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cu_sa' --fp16 true \
# --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps ''\
# --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1\
# --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1\
# --generate_hypothesis true --lambda_mt 1 --epoch_size 10000 \
# --data_path '/lustre/S/xushangqing/TransCoder/data/train_dataset/cpp-cu-.XLM-syml/'\
# --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01'\
# --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cu_sa'\
# --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1'\
# --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0\
# --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false\
# --lgs 'cpp_sa-cu_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1\
# --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
sleep 8h
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/pretrain/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 10000 --data_path '/lustre/S/xushangqing/re_TransCoder/data/train_dataset/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/xushangqing/re_TransCoder/trained/pretrain/mlm_cpp_cuda/17158/best-valid_mlm_ppl.pth,/lustre/S/xushangqing/re_TransCoder/trained/pretrain/mlm_cpp_cuda/17158/best-valid_mlm_ppl.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
export WORLD_SIZE=8
# python translate.py --src_lang cpp --tgt_lang python --model_path model/model_1.pth < $1
python translate.py --src_lang cpp --tgt_lang python --model_path pretrained/model_1.pth < data/evaluation/geeks_for_geeks_successful_test_scripts/cpp/BINARY_SEARCH.cpp
# python translate.py --src_lang cpp --tgt_lang python --model_path pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth < data/evaluation/geeks_for_geeks_successful_test_scripts/cpp/BINARY_SEARCH.cpp
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 64000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,6' --lambda_bt 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only false
# jexport NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/tmp/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload true for pretrain
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 3000 --data_path '/lustre/S/xushangqing/re_TransCoder/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/pgpbjiv45a/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/pgpbjiv45a/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only true
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/' --max_len 500 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 3000 --data_path '/lustre/S/xushangqing/re_TransCoder/data/train_dataset/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
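# Note on warm-starting: XLM's --reload_model expects "<encoder_ckpt>,<decoder_ckpt>",
# which is why the commented runs above pass the same MLM checkpoint twice: it
# initializes both encoder and decoder from pretraining. A minimal sketch (the
# checkpoint path below is illustrative, not one used in this run):
# PRETRAIN=/lustre/S/xushangqing/re_TransCoder/trained/pretrain/mlm_cpp_cuda/<exp_id>
# ... --reload_model "$PRETRAIN/best-valid_mlm_ppl.pth,$PRETRAIN/best-valid_mlm_ppl.pth" ...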
# python XLM/train.py \
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py\
# --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 \
# --word_blank '0.1' --n_layers 4 --save_periodic 1 \
# --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/' \
# --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true \
# --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps ''\
# --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1\
# --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1\
# --generate_hypothesis true --lambda_mt 1 --epoch_size 1000 \
# --data_path '/lustre/S/xushangqing/re_TransCoder/data/train_dataset/cpp-cuda-.with_comments.XLM-syml/'\
# --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01'\
# --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa'\
# --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1'\
# --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0\
# --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false\
# --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1\
# --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
\ No newline at end of file
# run for train
# no ae, only mt.
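# (i.e. --ae_steps is omitted and only the supervised --mt_steps 'cpp_sa-cuda_sa'
# objective drives training; see the uncommented command below)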
cd ..
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank 0 --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/' --max_len 5120 --bptt 256 --lambda_clm 1 --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 0 --tokens_per_batch 2000 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --epoch_size 10000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/data/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --stopping_criterion 'valid_cpp_sa-cuda_sa_mt_bleu,10' --validation_metrics 'valid_cpp_sa-cuda_sa_mt_bleu' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/mlm_cpp_cuda/17569/best-valid_mlm_ppl.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/mlm_cpp_cuda/17569/best-valid_mlm_ppl.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --beam_size 1 --lg_sampling_factor '0.1' --eval_only false --exp_name debug_xsq
#export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank 0 --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/' --max_len 5120 --bptt 512 --lambda_clm 1 --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 0 --tokens_per_batch 2000 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --epoch_size 10000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/data/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --stopping_criterion 'valid_cpp_sa-cuda_sa_mt_bleu,10' --validation_metrics 'valid_cpp_sa-cuda_sa_mt_bleu' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/mlm_cpp_cuda/17569/best-valid_mlm_ppl.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/mlm_cpp_cuda/17569/best-valid_mlm_ppl.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --beam_size 1 --lg_sampling_factor '0.1' --eval_only false --exp_name debug_xsq
export NGPU=1; export CUDA_VISIBLE_DEVICES=0; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank 0 --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/mt/' --max_len 1024 --bptt 512 --lambda_clm 1 --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 0 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 1000 --epoch_size 1000 --data_path '/lustre/S/xushangqing/re_TransCoder/data/train_dataset/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --stopping_criterion 'valid_cpp_sa-cuda_sa_mt_bleu,10' --validation_metrics 'valid_cpp_sa-cuda_sa_mt_bleu' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0' --reload_model '' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --beam_size 1 --lg_sampling_factor '0.1' --eval_only false --exp_name debug_xsq
# /lustre/S/xushangqing/re_TransCoder/trained/pretrain/mlm_cpp_cuda/17699/best-valid_mlm_ppl.pth,/lustre/S/xushangqing/re_TransCoder/trained/pretrain/mlm_cpp_cuda/17699/best-valid_mlm_ppl.pth
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps ''
# --word_shuffle 3 --tokens_per_batch 2000 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 10000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01'
# --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/mlm_cpp_cuda/17569/best-valid_mlm_ppl.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/20210526/model/mlm_cpp_cuda/17569/best-valid_mlm_ppl.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
#
#
# run for eval
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 1000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/bt_with_comments_sa_final_modif_test/qiff5v05p7/periodic-36.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/bt_with_comments_sa_final_modif_test/qiff5v05p7/periodic-36.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only true
cd -
pair=cpp-cu
OUTPATH=/lustre/S/xushangqing/TransCoder/data/train_dataset
for lg in $(echo $pair | sed -e 's/\-/ /g'); do
    for split in train valid test; do
        $FASTBPE applybpe $OUTPATH/$pair.$lg.$split data/wiki/para/$pair.$lg.$split $OUTPATH/codes
        python preprocess.py $OUTPATH/vocab $OUTPATH/$pair.$lg.$split
    done
done
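# The loop above assumes fastBPE has been built, that $FASTBPE points at the binary,
# and that $OUTPATH/codes and $OUTPATH/vocab were produced beforehand. A minimal sketch
# of that setup (the training file name is illustrative):
# FASTBPE=XLM/tools/fastBPE/fast
# $FASTBPE learnbpe 50000 $OUTPATH/all.train.tok > $OUTPATH/codes
# $FASTBPE getvocab $OUTPATH/all.train.tok > $OUTPATH/vocab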
\ No newline at end of file
# python -m preprocessing.preprocess /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_ori/ --lang1 cpp --lang2 python --keep_comments True --bpe_train_size 0 --test_size 10 --local True
python -m preprocessing.preprocess /lustre/S/xushangqing/re_TransCoder/data/train_dataset/ --lang1 cpp --lang2 cuda --keep_comments True --bpe_train_size 0 --test_size 10 --local True
# python XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 64000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/data/test_dataset/cpp-python-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,python' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,10' --lambda_bt 1 --dump_path './temp' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-python' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_python --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,6' --lambda_bt 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
cd ..
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/xushangqing/re_TransCoder/data/train_dataset/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,10' --lambda_bt 1 --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/pretrain/' --lambda_mt 1 --epoch_size 500 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false --max_len 1024
cd -
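# In XLM, --stopping_criterion '_valid_mlm_ppl,10' stops training after 10 consecutive
# evaluations without improvement of the validation MLM perplexity (a leading '_'
# marks a metric to be minimized), and --validation_metrics selects the metric used
# to save the best-*.pth checkpoint (e.g. the best-valid_mlm_ppl.pth paths noted above).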
binary_command = python /lustre/S/wenyuanbo/Workspace/github/TransCoder/XLM/preprocess.py /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/vocab /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/cpp.valid.with_comments.functions_standalone.bpe
/lustre/S/wenyuanbo/Workspace/github/TransCoder/XLM/tools/fastBPE/fast applybpe /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/cuda.train.with_comments.6.bpe /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cuda/train.with_comments.6.tok /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000_test/cpp-cuda-.with_comments/codes
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J test # The job name
#SBATCH -o ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu,nv-gpu-hw # Submit to 'nv-gpu' and 'nv-gpu-hw' Partition
#SBATCH -t 0-12:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:4 # Request M GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
###
### The system will alloc 8 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
#SBATCH --qos=gpu-normal # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
which python3
which python
# ##- tools
# module load cluster-tools/v1.0
# module load cmake/3.15.7
# module load git/2.17.1
# module load vim/8.1.2424
# ##- language
# module load python3/3.6.8
# module load llvm/9.0.1
# module load gcc/9.3.0
# ##- cuda
# module load cuda-cudnn/11.0-8.0.4
##- virtualenv
# source xxxxx/activate
# conda init
# conda activate
#- Log information
echo $(module list) # list modules loaded
# echo $(which gcc)
echo $(which python)
echo $(which python3)
# echo $(conda list)
# echo $(pip list)
# cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
echo "Use GPU ${CUDA_VISIBLE_DEVICES}$" # which gpus
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
#- Job step
# python -m preprocessing.preprocess /lustre/S/xushangqing/TransCoder/data/train_dataset/ --lang1 cpp --lang2 cu --keep_comments False --bpe_train_size 0 --test_size 1000 --local True
# export NGPU=1; python XLM/train.py \
# --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 \
# --word_blank '0.1' --n_layers 6 --save_periodic 1 \
# --dump_path '/lustre/S/xushangqing/TransCoder/trained/' \
# --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cu_sa' --fp16 true \
# --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps ''\
# --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1\
# --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1\
# --generate_hypothesis true --lambda_mt 1 --epoch_size 10000 \
# --data_path '/lustre/S/xushangqing/TransCoder/data/train_dataset/cpp-cu-.XLM-syml/'\
# --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01'\
# --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cu_sa'\
# --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1'\
# --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0\
# --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false\
# --lgs 'cpp_sa-cu_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1\
# --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
sleep 8h
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
export WORLD_SIZE=8
# python translate.py --src_lang cpp --tgt_lang python --model_path model/model_1.pth < $1
python translate.py --src_lang cpp --tgt_lang python --model_path pretrained/model_1.pth < data/evaluation/geeks_for_geeks_successful_test_scripts/cpp/BINARY_SEARCH.cpp
# python translate.py --src_lang cpp --tgt_lang python --model_path pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth < data/evaluation/geeks_for_geeks_successful_test_scripts/cpp/BINARY_SEARCH.cpp
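# A minimal sketch for batch-translating every C++ test script (the output directory
# and loop are illustrative additions, not part of the original pipeline):
# mkdir -p translated
# for f in data/evaluation/geeks_for_geeks_successful_test_scripts/cpp/*.cpp; do
#     python translate.py --src_lang cpp --tgt_lang python --model_path pretrained/model_1.pth \
#         < "$f" > "translated/$(basename "$f" .cpp).py"
# done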
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 64000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,6' --lambda_bt 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only false
# jexport NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/tmp/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload true for pretrain
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 3000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/tmp/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/pgpbjiv45a/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/pgpbjiv45a/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only true
# run for train
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 10000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/checkpoint/best-valid_mlm_ppl.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/checkpoint/best-valid_mlm_ppl.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
# run for eval
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 1000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/bt_with_comments_sa_final_modif_test/qiff5v05p7/periodic-36.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/bt_with_comments_sa_final_modif_test/qiff5v05p7/periodic-36.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only true
pair=cpp-cu
OUTPATH=/lustre/S/xushangqing/TransCoder/data/train_dataset
for lg in $(echo $pair | sed -e 's/\-/ /g'); do
    for split in train valid test; do
        $FASTBPE applybpe $OUTPATH/$pair.$lg.$split data/wiki/para/$pair.$lg.$split $OUTPATH/codes
        python preprocess.py $OUTPATH/vocab $OUTPATH/$pair.$lg.$split
    done
done
\ No newline at end of file
# python -m preprocessing.preprocess /lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_ori/ --lang1 cpp --lang2 python --keep_comments True --bpe_train_size 0 --test_size 10 --local True
python -m preprocessing.preprocess /lustre/S/xushangqing/TransCoder/data/train_dataset/ --lang1 cpp --lang2 cuda --keep_comments True --bpe_train_size 0 --test_size 1000 --local True
# python XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 64000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/data/test_dataset/cpp-python-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,python' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,10' --lambda_bt 1 --dump_path './temp' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-python' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_python --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,6' --lambda_bt 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,10' --lambda_bt 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_test_for_sep/' --lambda_mt 1 --epoch_size 10000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py \
--n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 \
--word_blank '0.1' --n_layers 6 --save_periodic 1 \
--dump_path '/lustre/S/xushangqing/TransCoder/trained/' \
--max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cu_sa' --fp16 true \
--share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps ''\
--word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1\
--split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1\
--generate_hypothesis true --lambda_mt 1 --epoch_size 10000 \
--data_path '/lustre/S/xushangqing/TransCoder/data/train_dataset/cpp-cu-.XLM-syml/'\
--gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01'\
--eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cu_sa'\
--reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1'\
--min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0\
--word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false\
--lgs 'cpp_sa-cu_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1\
--lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
\ No newline at end of file
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 64000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,6' --lambda_bt 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only false
# jexport NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/tmp/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload true for pretrain
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 3000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only true
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py \
--n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 \
--word_blank '0.1' --n_layers 6 --save_periodic 1 \
--dump_path '/lustre/S/xushangqing/TransCoder/trained/' \
--max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cu_sa' --fp16 true \
--share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' \
--word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1 \
--split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 \
--generate_hypothesis true --lambda_mt 1 --epoch_size 10000 \
--data_path '/lustre/S/xushangqing/TransCoder/data/train_dataset/cpp-cu-.XLM-syml/' \
--gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' \
--eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cu_sa' \
--reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' \
--min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 \
--word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false \
--lgs 'cpp_sa-cu_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 \
--lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
\ No newline at end of file
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J test # The job name
#SBATCH -o ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu,nv-gpu-hw # Submit to the 'nv-gpu' and 'nv-gpu-hw' partitions
#SBATCH -t 0-12:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:4 # Request M GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --constraint="V100|V100S|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
#SBATCH --ntasks=1
###
### The system will allocate 8 cores per GPU by default.
### If you need more or fewer, use the following:
### #SBATCH --cpus-per-task=K # Request K cores
###
#SBATCH --qos=gpu-short # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
##- tools
# module load cluster-tools/v1.0
# module load cmake/3.15.7
# module load git/2.17.1
# module load vim/8.1.2424
# ##- language
# module load python3/3.6.8
# ##- cuda
# module load cuda-cudnn/11.0-8.0.4
##- virtualenv
# source xxxxx/activate
#- Log information
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
echo "Use GPU ${CUDA_VISIBLE_DEVICES}$" # which gpus
#- Warning! Please not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo $SLURM_JOB_NAME
echo $SLURM_NNODES
echo $SLURM_JOBID
echo $SLURM_NTASKS
echo $SLURM_TASKS_PER_NODE
echo $SLURM_JOB_ID
echo $SLURM_SUBMIT_DIR
echo $SLURM_NPROCS
echo $SLURM_CPUS_ON_NODE
echo $SLURM_JOB_NODELIST
echo $SLURM_JOB_CPUS_PER_NODE
echo $SLURM_SUBMIT_HOST
echo $SLURM_JOB_NUM_NODES
env
#- Job step
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/pretrain/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch -1 --has_sentences_ids true --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 10000 --data_path '/lustre/S/xushangqing/re_TransCoder/data/train_dataset/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation false --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps 'cpp_sa-cuda_sa' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/xushangqing/re_TransCoder/trained/pretrain/mlm_cpp_cuda/17158/best-valid_mlm_ppl.pth,/lustre/S/xushangqing/re_TransCoder/trained/pretrain/mlm_cpp_cuda/17158/best-valid_mlm_ppl.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only false
#pretrain
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/xushangqing/re_TransCoder/data/train_dataset/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0.1 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,10' --lambda_bt 1 --dump_path '/lustre/S/xushangqing/re_TransCoder/trained/pretrain/' --lambda_mt 1 --epoch_size 10000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
sleep 8h
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps '' --max_vocab 64000 --word_mask_keep_rand '0.8,0.1,0.1' --word_blank 0 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --save_periodic 0 --bptt 512 --lambda_clm 1 --ae_steps '' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --word_shuffle 0 --mlm_steps 'cpp,cuda' --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 100000 --stopping_criterion '_valid_mlm_ppl,6' --lambda_bt 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --lambda_mt 1 --epoch_size 100000 --early_stopping false --gelu_activation false --n_layers 6 --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0003,weight_decay=0.01' --validation_metrics _valid_mlm_ppl --eval_bleu false --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout 0 --reload_model '' --min_count 0 --lgs 'cpp-cuda' --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 0 --clip_grad_norm 5 --emb_dim 1024 --encoder_only true --beam_size 1 --clm_steps '' --exp_name mlm_cpp_cuda --lambda_ae 1 --lg_sampling_factor '-1' --eval_only false
# export NGPU=8; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only false
# jexport NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model/mlm_cpp_cuda/89qzqrcec3/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids false --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/data_big_10000/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload false
# export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab '-1' --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 60 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 30 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/tmp/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 8 --context_size 0 --word_dropout '0.1' --reload_model '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth,/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/model_big_10000_test/mlm_cpp_cuda/mifas4kg4t/checkpoint.pth' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '-1' --eval_only true
# reload true for pretrain
export NGPU=1; python -m torch.distributed.launch --nproc_per_node=$NGPU XLM/train.py --n_heads 8 --bt_steps 'cpp_sa-cuda_sa-cpp_sa' --max_vocab 10000 --word_mask_keep_rand '0.8,0.1,0.1' --gen_tpb_multiplier 1 --word_blank '0.1' --n_layers 6 --save_periodic 1 --dump_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/' --max_len 5120 --bptt 256 --lambda_clm 1 --ae_steps 'cpp_sa,cuda_sa' --fp16 true --share_inout_emb true --lambda_mlm 1 --sinusoidal_embeddings false --mlm_steps '' --word_shuffle 3 --tokens_per_batch 6000 --has_sentences_ids true --attention_dropout 0 --split_data false --length_penalty 1 --max_epoch 10000000 --stopping_criterion '' --lambda_bt 1 --generate_hypothesis true --lambda_mt 1 --epoch_size 3000 --data_path '/lustre/S/wenyuanbo/Workspace/github/TransCoder/pldi/test_for_sep/cpp-cuda-.with_comments.XLM-syml/' --gelu_activation false --split_data_accross_gpu global --optimizer 'adam_inverse_sqrt,warmup_updates=10000,lr=0.0001,weight_decay=0.01' --eval_computation true --validation_metrics '' --eval_bleu true --dropout '0.1' --mt_steps '' --reload_emb '' --batch_size 32 --context_size 0 --word_dropout '0.1' --reload_model '' --min_count 0 --eval_bleu_test_only false --group_by_size true --early_stopping false --sample_alpha 0 --word_pred '0.15' --amp 2 --max_batch_size 128 --clip_grad_norm 5 --emb_dim 1024 --encoder_only false --lgs 'cpp_sa-cuda_sa' --clm_steps '' --exp_name bt_with_comments_sa_final_modif_test --beam_size 1 --lambda_ae '0:1,100000:0.1,300000:0' --lg_sampling_factor '0.1' --eval_only true
import pandas as pd
import re
from tqdm import tqdm
def fetch_func(par, ind, cont): # given the list of brace offsets in the file and the offset of a function's parameter list, return the start/end offsets of the function body; the logic is to find the first matched pair of braces after the parameter parentheses
t=0
for i in range(len(par)):
if par[i]>ind:
t=i
break
t0=t
lc=1
rc=0
while lc>rc:
t+=1
try:
if cont[par[t]]=='{':
lc+=1
else:
rc+=1
except Exception:
# print('exception', ind, par[t0], cont[ind-20:ind+20])
return 0, 0
return par[t0], par[t]
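# Minimal usage sketch (hypothetical names, not executed by this script): fetch_func is
# driven with the offsets of every brace in a source string, e.g.
#   snippet = "void foo(int a){ if(a){ a++; } }"
#   braces = [m.start() for m in re.finditer(r'\{|\}', snippet)]
#   start, end = fetch_func(braces, snippet.index('('), snippet)
#   snippet[start:end+1]  # -> "{ if(a){ a++; } }"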
if __name__ == '__main__':
tet='''
__global__ void foo(int* a,
int *b){
int a;
if(condition){
int b;
}
for(int c; c<10; c++){
foo();
}
}
__global__ void reportThreadsKernel(int* threadIdsX, int* threadIdsY,
int* threadIdsZ, int* blockIdsX,
int* blockIdsY, int* blockIdsZ) {
int tid = (threadXStride * threadIdx.x) + (threadYStride * threadIdx.y) +
(threadZStride * threadIdx.z) + (blockXStride * blockIdx.x) +
(blockYStride * blockIdx.y) + (blockZStride * blockIdx.z);
} '''
    pat=re.compile(r'(\b[A-Za-z0-9_\ ]+\s+\b[A-Za-z0-9_]+\s*\(.*?\))(?=\s*\{)', re.DOTALL) # match a function signature (return type, name and parameter parentheses) up to, but not including, the opening brace
    pat_par=re.compile(r'\{|\}') # match curly braces, used by fetch_func
pat_sent=re.compile(r'\;')
# print([re.sub(r'\s+', ' ', i) for i in pat.findall(tet)])
# exit()
# df=pd.read_csv('test.csv')
# print(df.columns)
cnt=0
err=0
stop_set=set(['main', 'printf', 'operator'])
cnt1=0
    sizes=[2648, 66086, 183558, 153765] # hard-coded row counts of the four input files, only used to size the tqdm progress bar
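    # A possible alternative (sketch only, not used here): compute the row counts on the fly
    # instead of hard-coding them, at the cost of reading each file twice, e.g.
    #   sizes = [len(pd.read_csv('consis_cuda_cpp_00000000000%d.gz' % i,
    #                            compression='gzip', usecols=[0])) for i in range(4)]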
out_data={'repo_name':[], 'cpp':[], 'cpp_title':[],'cpp_path':[], 'cuda':[], 'cuda_title':[], 'cuda_path': []}
for i in range(4):
df=pd.read_csv('consis_cuda_cpp_00000000000%d.gz'%i, compression='gzip')
# df=pd.read_csv('test.csv')
with tqdm(total=sizes[i]) as bar:
# with tqdm(total=50) as bar:
for ind, ser in df.iterrows():
cont, cont_1=ser.get('content'), ser.get('content_1')
bar.update(1)
bar.set_description('{} {}'.format(cnt, err))
                if not (isinstance(cont, str) and isinstance(cont_1, str)): # because of indentation issues and missing metadata entries, the csv cell may come back empty (NaN)
err+=1
continue
                fs=pat.finditer(cont) # cpp functions
                gs=pat.finditer(cont_1) # cuda functions; finditer is used so the start offset of each match is kept
fp=[i.start() for i in pat_par.finditer(cont)]
gp=[i.start() for i in pat_par.finditer(cont_1)]
                # below, g-prefixed variables refer to cuda and f-prefixed variables to cpp
gfs=[]
gcs=[]
gts=[]
# ct=0
for g in gs:
t0, t=fetch_func(gp, g.start(), cont_1)
                    conti=cont_1[t0:t+1] # conti = the body of this cuda function
if conti.find('block')>0 and conti.find('thread')>0 and conti.find('#')<0:
                        gfs.append(re.sub(r'\s+', ' ', g.group()).split('(')[0].split(' ')[-1]) # gfs: cuda function names (whitespace collapsed to avoid false mismatches)
                        gcs.append((t0, t+1)) # gcs: (start, end) offsets of each cuda function body
li=re.sub(r'\s+', ' ', g.group()).split('{')[0]
                        gts.append(''.join([li.split('(')[0].split(' ')[-1], '(', li.split('(')[1]])) # gts: function name plus parameter list (the logic here is somewhat redundant)
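                # Hypothetical illustration of the lists built above: for a cuda kernel
                # "__global__ void foo(int* a, int *b){ ... blockIdx ... threadIdx ... }"
                # this loop records gfs=['foo'], gts=['foo(int* a, int *b)'] and
                # gcs=[(body_start, body_end)].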
pairs=[]
ffs=[]
for f in fs:
fi=re.sub(r'\s+', ' ', f.group()).split('(')[0].split(' ')[-1]
li=re.sub(r'\s+', ' ', f.group()).split('{')[0]
ft=''.join([li.split('(')[0].split(' ')[-1], '(', li.split('(')[1]])
# print(fi, fi in gfs)
                    if fi in gfs and fi not in stop_set and len(fi)>5 and (fi not in ffs): # pair functions up by the cpp function name; this matching logic could still be refined
t0, t=fetch_func(fp, f.start(), cont)
if t-t0<=10 or cont[t0:t+1].find('#')>0 or t-t0-cont[t0:t+1].find(';')<10:
continue
out_data['cpp'].append(cont[t0:t+1])
out_data['cpp_title'].append(ft)
pairi=gcs[gfs.index(fi)]
out_data['cuda'].append(cont_1[pairi[0]:pairi[1]])
out_data['cuda_title'].append(gts[gfs.index(fi)])
out_data['cpp_path'].append(ser.get('path'))
out_data['cuda_path'].append(ser.get('path_1'))
out_data['repo_name'].append(ser.get('f0_'))
ffs.append(fi)
# cnt+=len(cfs)
# break
cnt=len(out_data['cpp'])
continue
                # the logic below was only used for the initial counting pass (dead code after the continue above)
try:
fs=[re.sub(r'\s+', ' ', i).split('(')[0].split(' ')[-1] for i in\
pat.findall(cont)]
gs=[re.sub(r'\s+', ' ', i).split('(')[0].split(' ')[-1] for i in\
pat.findall(cont_1)]
except Exception:
print(cont, cont_1)
continue
fs=[i for i in fs if len(i)>0]
gs=[i for i in gs if len(i)>0]
cnt+=len(set(fs).intersection(set(gs)).difference(stop_set))
# print(fs, gs, set(fs).intersection(set(gs)).difference(stop_set))
# cnt1+=1
# if cnt1>5:
# break
print(cnt, err)
dfi=pd.DataFrame(out_data)
dfi.to_csv('out_with_title.csv')
\ No newline at end of file
import gzip
import json
with gzip.open('../data/train_dataset/cuda/cuda.000.json.gz', 'rb') as f:
cnt=0
for line in f.read().decode().split('\n'):
i=json.loads(line)
if i['content'].split('(')[0]=='BilinearSamplingForward':
            cnt+=1
print(cnt)
with gzip.open('../data/train_dataset/cpp/cpp.000.json.gz', 'rb') as f:
for line in f.read().decode().split('\n'):
i=json.loads(line)
if i['content'].split('(')[0]=='BilinearSamplingForward':
            cnt+=1
print(cnt)
with gzip.open('../data/train_dataset/cuda/cuda.000.json.gz', 'rb') as f:
cnt=0
for line in f.read().decode().split('\n'):
i=json.loads(line)
print(i['content'].split('(')[0], "###")
\ No newline at end of file
......@@ -15,6 +15,7 @@
#
import argparse
from logging import Logger, getLogger
import os
import sys
......@@ -82,22 +83,30 @@ class Translator:
self.reloaded_params['reload_model'] = ','.join([params.model_path] * 2)
encoder, decoder = build_model(self.reloaded_params, self.dico)
        getLogger().info("built encoder and decoder")
self.encoder = encoder[0]
        getLogger().info("reloading encoder weights")
self.encoder.load_state_dict(reloaded['encoder'])
        getLogger().info("encoder weights loaded")
assert len(reloaded['encoder'].keys()) == len(
list(p for p, _ in self.encoder.state_dict().items()))
        getLogger().info("encoder parameters verified")
self.decoder = decoder[0]
self.decoder.load_state_dict(reloaded['decoder'])
assert len(reloaded['decoder'].keys()) == len(
list(p for p, _ in self.decoder.state_dict().items()))
        getLogger().info("decoder weights loaded and verified")
self.encoder.cuda()
self.decoder.cuda()
self.encoder.eval()
self.decoder.eval()
        getLogger().info("models moved to GPU and set to eval mode")
self.bpe_model = fastBPE.fastBPE(os.path.abspath(params.BPE_path))
        getLogger().info("BPE model loaded")
def translate(self, input, lang1, lang2, n=1, beam_size=1, sample_temperature=None, device='cuda:0'):
with torch.no_grad():
......