Advanced Computing Platform for Theoretical Physics

Commit b325dde9 authored by Pengfei Zhou

random pattern, tang, wikitext2

parent 38b39860
......@@ -78,6 +78,7 @@ group.add_argument('--model_type',type=str,default='gru')
group.add_argument('--seed1',type=int,default=1)
group.add_argument('--seed2',type=int,default=1)
group.add_argument('--data_type',type=str,default='random_pattern')
group.add_argument('--optimizer',type=str,default='sgd')
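# '--optimizer' selects the training optimizer ('sgd' or 'adam'); it is consumed by the optimizer setup further down in both training scripts.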
args = parser.parse_args()
'''
if args.ingpu >= 0:
......
##for random_pattern and tang dataset
import math
import torch
import torch.nn as nn
......@@ -108,20 +109,25 @@ else:
ntokens = args.vocab_size
print(ntokens)
if args.model_type=='gru':
from model import *
model= PoetryModel(args)
if args.model_path:
from model import *
model= PoetryModel(args)
'''
if args.model_path:
model.load_state_dict(t.load(args.model_path,map_location='cpu'))
'''
elif args.model_type=='mytransf':
model=make_model(ntokens,N=args.N,d_model=args.d_model,d_ff=args.d_ff,h=args.h,dropout=0.2)
model=make_model(ntokens,N=args.N,d_model=args.d_model,d_ff=args.d_ff,h=args.h,dropout=0.2)
elif args.model_type=='pytransf':
model=TransformerModel(ntokens, args.d_model, args.h, args.d_ff, args.N, dropout)
model=TransformerModel(ntokens, args.d_model, args.h, args.d_ff, args.N, dropout)
model.to(device)
#model = TransformerModel(args.V, args.d_model, args.h, args.d_ff, args.N, dropout).to(device)
criterion = nn.CrossEntropyLoss()
lr = args.lr # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=1)
if args.optimizer=='adam':
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
elif args.optimizer=='sgd':
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)
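# StepLR with step_size=1 and gamma=0.95 multiplies the learning rate by 0.95 on every
# scheduler.step() call, i.e. lr * 0.95**k after k steps (roughly 0.60 * lr after 10).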
import time
def train():
......@@ -157,7 +163,7 @@ def train():
output = model(data, src_mask)
loss = criterion(output.view(-1, ntokens), targets)
loss.backward()
#torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
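# clip gradients to a max norm of 0.5 (the commented line above shows the previous state); this guards against exploding gradients.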
optimizer.step()
total_loss += loss.item()*now_samples
......
#for wikitext dataset
import math
import torch
import torch.nn as nn
from config import args
import torch.nn.functional as F
import numpy as np
import random
from torch.utils.data import DataLoader
from model_T import *
np.random.seed(args.seed2)
random.seed(args.seed2)
torch.manual_seed(args.seed1)
torch.cuda.manual_seed_all(args.seed1)
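# reproducibility: seed2 fixes the NumPy and Python RNGs, seed1 fixes the torch CPU and CUDA RNGs.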
class TransformerModel(nn.Module):
def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
......@@ -16,14 +24,11 @@ class TransformerModel(nn.Module):
self.encoder = nn.Embedding(ntoken, ninp)
self.ninp = ninp
self.decoder = nn.Linear(ninp, ntoken)
self.init_weights()
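# Causal (subsequent-position) mask: entry (i, j) is 0 for j <= i and -inf for j > i,
# so position i can only attend to positions up to and including itself. For example,
# generate_square_subsequent_mask(3) returns
# [[0., -inf, -inf],
#  [0.,   0., -inf],
#  [0.,   0.,   0.]]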
def generate_square_subsequent_mask(self, sz):
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
......@@ -37,7 +42,6 @@ class TransformerModel(nn.Module):
output = self.decoder(output)
return output
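# Sinusoidal positional encoding (sin on even indices, cos on odd), added to the token embeddings and followed by dropout.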
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
......@@ -49,7 +53,6 @@ class PositionalEncoding(nn.Module):
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:x.size(0), :]
return self.dropout(x)
......@@ -73,28 +76,43 @@ def batchify(data, bsz):
data = data.view(bsz, -1).t().contiguous()
return data.to(device)
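# batchify reshapes the corpus to (seq_len, bsz); each column is an independent, contiguous stream of tokens.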
batch_size = 1
batch_size = args.batch_size
eval_batch_size = 10
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)
bptt = 20
bptt = args.len_seq
bptt=35
def get_batch(source, i):
seq_len = min(bptt, len(source) - 1 - i)
data = source[i:i+seq_len]
target = source[i+1:i+1+seq_len].reshape(-1)
target = source[i+1:i+1+seq_len]
return data, target
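# get_batch slices a window of at most bptt tokens starting at i, and the same window shifted by one position as targets.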
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
print(ntokens)
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
emsize = args.d_model # embedding dimension
nhid = args.d_ff # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = args.N # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = args.h # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
def make_mask(tgt,pad=0):
tgt_mask=(tgt!=pad).unsqueeze(-2)
tgt_mask=tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
return tgt_mask
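# make_mask combines a padding mask (tgt != pad) with a causal mask for the 'mytransf' model; subsequent_mask is presumably the annotated-transformer-style helper imported from model_T.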
if args.model_type=='pytransf':
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
elif args.model_type=='mytransf':
model=make_model(ntokens,N=args.N,d_model=args.d_model,d_ff=args.d_ff,h=args.h,dropout=0.2).to(device)
elif args.model_type=='gru':
from model import *
args.vocab_size=ntokens
model= PoetryModel(args)
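# for the GRU model, the wikitext vocabulary size is written into args.vocab_size before PoetryModel is constructed.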
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
if args.optimizer=='adam':
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
elif args.optimizer=='sgd':
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)
import time
......@@ -103,46 +121,66 @@ def train():
total_loss = 0.
start_time = time.time()
ntokens = len(TEXT.vocab.stoi)
src_mask = model.generate_square_subsequent_mask(bptt).to(device)
for batch, i in enumerate(range(0, bptt-1, bptt)):
all_bits=0
if args.model_type=='pytransf':
src_mask = model.generate_square_subsequent_mask(bptt).to(device)
for batch, i in enumerate(range(0, train_data.size(0)-1, bptt)):
data, targets = get_batch(train_data, i)
bits_batch=data.size(0)*data.size(1)
all_bits+=bits_batch
optimizer.zero_grad()
if data.size(0) != bptt:
src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
output = model(data, src_mask)
loss = criterion(output.view(-1, ntokens), targets)
if args.model_type=='pytransf':
if data.size(0) != bptt:
src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
output = model(data,src_mask)
elif args.model_type == 'mytransf':
data,targets = data.t(),targets.t()
src_mask= make_mask(data,pad=-1)
output = model(data,src_mask)
elif args.model_type=='gru':
output,_=model(data)
loss = criterion(output.reshape(-1, ntokens), targets.reshape(-1))
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
total_loss += loss.item()
total_loss += loss.item()*bits_batch
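# loss.item() is the mean cross-entropy per token, so weighting by bits_batch (tokens in this batch) and dividing by all_bits below yields the running per-token average for the epoch.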
log_interval = 200
if True :
cur_loss = total_loss / log_interval
if batch % log_interval==0:
cur_loss = total_loss / all_bits
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches | '
'lr {:02.2f} | ms/batch {:5.2f} | '
'loss {:.8g} | ppl {:.8g}'.format(
epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
elapsed * 1000 / log_interval,
cur_loss*bptt, math.exp(cur_loss)))
total_loss = 0
cur_loss, math.exp(cur_loss)))
#total_loss = 0
start_time = time.time()
print(total_loss/all_bits)
def evaluate(eval_model, data_source):
eval_model.eval() # Turn on the evaluation mode
total_loss = 0.
ntokens = len(TEXT.vocab.stoi)
all_bits=0
src_mask = model.generate_square_subsequent_mask(bptt).to(device)
with torch.no_grad():
for i in range(0, data_source.size(0) - 1, bptt):
data, targets = get_batch(data_source, i)
if data.size(0) != bptt:
all_bits+=data.size(0)*data.size(1)
if args.model_type=='pytransf':
#if data.size(0) != bptt:
src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
output = eval_model(data, src_mask)
output_flat = output.view(-1, ntokens)
total_loss += len(data) * criterion(output_flat, targets).item()
return total_loss / (len(data_source) - 1)
output = eval_model(data, src_mask)
elif args.model_type == 'mytransf':
data,targets = data.t(),targets.t()
src_mask= make_mask(data,pad=-1)
output = eval_model(data, src_mask)
elif args.model_type == 'gru':
output,_ =model(data)
output_flat = output.reshape(-1, ntokens)
total_loss += data.size(0)*data.size(1) * criterion(output_flat, targets.reshape(-1)).item()
return total_loss / all_bits
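# evaluate returns the average per-token cross-entropy on data_source; its exponential is the model perplexity.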
best_val_loss = float("inf")
epochs = 200 # The number of epochs
best_model = None
......@@ -150,6 +188,7 @@ best_model = None
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train()
'''
val_loss = evaluate(model, val_data)
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
......@@ -160,7 +199,7 @@ for epoch in range(1, epochs + 1):
if val_loss < best_val_loss:
best_val_loss = val_loss
best_model = model
'''
scheduler.step()
test_loss = evaluate(best_model, test_data)
print('=' * 89)
......