The thread tried to read from or write to a virtual address for which it does not have the appropriate access.
#2,777 opened on Mar 19, 2024
Repository metrics
- Stars
- (10,351 stars)
- PR merge metrics
- (Avg merge 5d 12h) (6 merged PRs in 30d)
Description
Hi,
I hope that all the provided information meets your criteria.
- Nuitka version, full Python version, flavor, OS, etc.:
> D:\>python -m nuitka --version
2.1.2 Commercial: None Python: 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)] Flavor: CPython Official Executable: C:\prgFiles\Python\Python311\python.exe OS: Windows Arch: x86_64 WindowsRelease: 10 Version C compiler: ~\AppData\Local\Nuitka\Nuitka\Cache\downloads\gcc\x86_64\13.2.0-16.0.6-11.0.1-msvcrt-r1\mingw64\bin\gcc.exe (gcc 13.2.0).
- How did you install Nuitka and Python: Nuitka is installed into the environment using pip, as are all the other packages:
> python -m pip freeze
D:>python -m pip freeze Nuitka==2.1.2 numpy==1.23.5 pandas==2.0.2 scipy==1.10.1 tensorboard==2.12.3 torch==2.0.1 tqdm==4.65.0 transformers==4.30.2 ....
- Description of issue: The executable file crashes after a few seconds. I checked C:\Users\<user_name>\AppData\Local\CrashDumps\PretrainMODAI.exe.6012.dmp and came across this information:
Process Name: <path_file>\PretrainMODAI.exe Process Architecture: x64 Exception Code: 0xc0000005 Exception Information: The thread tried to read from or write to a virtual address for which it does not have the appropriate access. Heap Information: Not Present Error Information: OS Version: 10.0.14393
- Many times when you get an error from Nuitka, your setup may be special:
Hello World test works!
- Also supply a Short, Self Contained, Correct, Example: Unfortunately, I possess three tensor files totaling 3.4 GB located within the "pretrained_tokenizer/pretrained_tensors" directory, accompanied by two JSON files (vocab.json and config.json) also located within the "pretrained_tokenizer" directory. Additionally, there are 82 text files totaling 25 MB within the "source_tokenizer" directory. Regrettably, due to their extensive size, these files cannot be provided for testing purposes.
# Test version for converting in exe file
import os,sys
import transformers
import torch
from pathlib import Path
from tqdm import tqdm
import random
from torch.utils.tensorboard import SummaryWriter
def get_config():
    """Return the static configuration dictionary used by the script.

    The ``"root"`` entry is the directory that contains this script; every
    path helper in this file joins its folder names onto it.

    Returns:
        dict: folder names, file-name templates and training hyper-parameters.
    """
    return {
        # Resolve against this file, not ``__name__``: ``__name__`` is the
        # module name ("__main__"), so ``realpath(__name__)`` silently
        # resolved against the current working directory and the data
        # folders were only found when launched from the script's folder.
        "root": os.path.dirname(os.path.realpath(__file__)),
        "file_name_source": "postgres.txt",
        "folder_source": "source_ini",
        "folder_source_tokenizer": "source_tokenizer",
        "folder_tokenizer": "pretrained_tokenizer",
        "folder_tensors": "pretrained_tensors",
        "batch_size": 32,
        "num_epochs": 4,
        "lr": 1e-5,
        "seq_len": 350,
        "max_len": 512,
        "model_folder": "model_results",
        "model_pretrained": "pretrained_model",
        "model_trained": "trained_model",
        "logging_dir": "logging_dir",
        "model_basename": "tmodel_",
        "preload": None,
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel",
    }
def get_model_folder_base_path(config):
    """Return ``<root>/<model_folder>`` from *config* as a string."""
    base = Path('.').joinpath(config["root"], config["model_folder"])
    return str(base)
#Folder of pretrained tensors files
def get_model_pretrained_tensors_path(config):
    """Return ``<root>/<folder_tokenizer>/<folder_tensors>`` as a string."""
    segments = (
        config["root"],
        config["folder_tokenizer"],
        config["folder_tensors"],
    )
    return str(Path('.', *segments))
#Folder for pretrained model
def get_model_folder_base_pretrained_path(config):
    """Return ``<root>/<model_folder>/<model_pretrained>`` as a string.

    Also prints the configured root folder as a diagnostic.

    Args:
        config: mapping with the keys ``"root"``, ``"model_folder"`` and
            ``"model_pretrained"``.
    """
    model_root = config["root"]
    print(f"Root folder: {model_root}")
    # Build the path once; previously it was computed twice and the first
    # result was stored in an unused local.
    return str(Path('.') / model_root / config["model_folder"] / config["model_pretrained"])
#Folder source for pretraining model
def get_folder_source_tokenizer_path(config):
    """Return ``<root>/<folder_source_tokenizer>`` from *config* as a string."""
    source_dir = Path('.').joinpath(config["root"], config["folder_source_tokenizer"])
    return str(source_dir)
#Folder source of tokenizer
def get_tokenizer_folder_path(config):
    """Return ``<root>/<folder_tokenizer>`` from *config* as a string."""
    return str(Path('.', config["root"], config["folder_tokenizer"]))
#The file json of tokenizer (vocabulary)
def get_tokenizer_file_path(config,version):
    """Return the path of the tokenizer JSON file for *version* as a string.

    The file name is the ``"tokenizer_file"`` template from *config* formatted
    with *version*, placed under ``<root>/<folder_tokenizer>``.
    """
    file_name = config["tokenizer_file"].format(version)
    tokenizer_dir = Path('.').joinpath(config["root"], config["folder_tokenizer"])
    return str(tokenizer_dir / file_name)
def mlm(tensor, seed, *, mask_prob=0.15, mask_token_id=3, special_token_ids=(0, 1, 2, 4)):
    """Replace a random subset of token ids in *tensor* with the mask id.

    The tensor is modified in place and also returned.

    Args:
        tensor: integer tensor of token ids.
        seed: value passed to ``torch.manual_seed`` before drawing the random
            numbers. This reseeds the global torch RNG on every call, so two
            calls with the same seed and shape select the same positions.
        mask_prob: probability with which an eligible position is masked.
        mask_token_id: id written into the selected positions.
        special_token_ids: ids that are never masked. The original comment
            listed them as 0 <s>, 1 <unk>, 2 </s> and 4 <pad>.
            NOTE(review): verify these ids (and the mask id) against the
            tokenizer's vocab.json.

    Returns:
        The same ``tensor`` object, with the selected positions overwritten.
    """
    torch.manual_seed(seed)
    # one uniform random float per token position
    rand = torch.rand(tensor.shape)
    # eligible = randomly selected AND not one of the protected ids
    mask_arr = rand < mask_prob
    for token_id in special_token_ids:
        mask_arr &= tensor != token_id
    # Boolean-mask assignment covers all rows at once; it replaces the
    # former per-row Python loop and yields the same result.
    tensor[mask_arr] = mask_token_id
    return tensor
class Dataset(torch.utils.data.Dataset):
    """Dataset that serves one row from every tensor in a dict of encodings."""

    def __init__(self, encodings):
        # keep a reference to the mapping of name -> tensor
        self.encodings = encodings

    def __len__(self):
        # sample count is the first dimension of the 'input_ids' entry
        input_ids = self.encodings['input_ids']
        return input_ids.shape[0]

    def __getitem__(self, i):
        # build {name: row i} for every entry held in the encodings
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[i]
        return sample
def main():
    """Prepare the data and the RoBERTa model for masked-LM pretraining.

    Steps, in order:
      1. Load the tokenizer saved in the tokenizer folder.
      2. Exit early if the model output folder is missing (it is created
         first) or if the pretrained-model folder already exists.
      3. Tokenize up to 200 ``.txt`` source files, mask them with ``mlm`` and
         save/reload the resulting tensors.
      4. Build the ``DataLoader``, the ``RobertaForMaskedLM`` model, the
         optimizer and the TensorBoard writer.

    The epoch loop itself is not included in this test version.
    """
    config = get_config()
    seed = 42  # By default
    torch.manual_seed(seed)
    number_of_epochs = 1

    # --- Tokenizer -------------------------------------------------------
    # folder with the trained tokenizer files
    tokenizer_pretrained_folder = get_tokenizer_folder_path(config)
    # folder with the split source text files
    tokenizer_source_folder = get_folder_source_tokenizer_path(config)
    print(f"tokenizer_source_folder: {tokenizer_source_folder}")
    max_len = config["max_len"]
    tokenizer = transformers.RobertaTokenizer.from_pretrained(
        tokenizer_pretrained_folder, max_len=max_len
    )
    vocab_size = tokenizer.vocab_size
    print("VOCAB_SIZE", vocab_size)

    # --- Output folder guards -------------------------------------------
    model_base_name = get_model_folder_base_path(config)
    if not os.path.exists(model_base_name):
        # First run: create the output folder, report, and stop.
        # NOTE(review): the extent of this branch is inferred, because the
        # pasted script lost its indentation; an unconditional exit here
        # would make everything below unreachable. Confirm against the
        # real script.
        os.makedirs(model_base_name, exist_ok=True)
        print(f"Nothing to do. The model folder [ {model_base_name} ] doesn't exists!")
        # Called only for its "Root folder" diagnostic print.
        get_model_folder_base_pretrained_path(config)
        sys.exit()

    path_pretrained_model = get_model_folder_base_pretrained_path(config)
    if os.path.exists(path_pretrained_model):
        print(f'The folder model [ {path_pretrained_model} ] already exists!')
        sys.exit()

    # --- Encode the source text -----------------------------------------
    paths = [str(x) for x in Path(tokenizer_source_folder).glob('**/*.txt')]
    if not paths:
        # torch.cat() on an empty list fails with an unhelpful error; stop
        # with a message that names the folder instead.
        sys.exit(f"No .txt source files found under [ {tokenizer_source_folder} ]")

    input_ids = []
    mask = []
    labels = []
    # open at most 200 files, encode them and add them to a single dataset
    for path in tqdm(paths[:200]):
        # one sample per line of the file
        with open(path, 'r', encoding='utf-8') as fp:
            lines = fp.read().split('\n')
        sample = tokenizer(lines, max_length=max_len, truncation=True, padding='max_length')
        # the unmasked token ids are the labels
        labels.append(torch.tensor(sample.input_ids))
        mask.append(torch.tensor(sample.attention_mask))
        # the inputs are a masked copy of the labels
        input_ids.append(mlm(labels[-1].detach().clone(), seed))

    # convert the lists of tensors into single tensors
    input_ids = torch.cat(input_ids)
    mask = torch.cat(mask)
    labels = torch.cat(labels)

    # --- Persist and reload the tensors ---------------------------------
    pretrained_tensors_path = get_model_pretrained_tensors_path(config)
    os.makedirs(pretrained_tensors_path, exist_ok=True)
    # os.path.join instead of a hard-coded '\\' keeps the paths valid on
    # every platform (and yields the same path on Windows).
    input_ids_file = os.path.join(pretrained_tensors_path, 'input_ids.pt')
    mask_file = os.path.join(pretrained_tensors_path, 'attention_mask.pt')
    labels_file = os.path.join(pretrained_tensors_path, 'labels.pt')
    torch.save(input_ids, input_ids_file)
    torch.save(mask, mask_file)
    torch.save(labels, labels_file)
    del input_ids, mask, labels
    # reload the tensors from the files just written
    input_ids = torch.load(input_ids_file)
    mask = torch.load(mask_file)
    labels = torch.load(labels_file)

    # --- Dataset and loader ---------------------------------------------
    print("Building dataset ...")
    encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
    dataset = Dataset(encodings)
    # NOTE(review): batch size is hard-coded to 64 while config["batch_size"]
    # is 32 -- confirm which one is intended.
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    # --- Model ----------------------------------------------------------
    dropout_prob = 0.15
    config_roberta = transformers.RobertaConfig(
        vocab_size=vocab_size,  # aligned with the trained tokenizer vocab
        max_position_embeddings=514,
        hidden_size=768,  # must be a multiple of num_attention_heads
        num_attention_heads=16,
        num_hidden_layers=8,
        type_vocab_size=2,
        hidden_dropout_prob=dropout_prob,
        # The RobertaConfig field is `attention_probs_dropout_prob`; the
        # former `attention_dropout_prob` keyword was not a config field,
        # so the attention dropout stayed at the library default.
        attention_probs_dropout_prob=dropout_prob,
    )

    # folder for the pretraining outputs (same path as checked above)
    model_folder_pretraining = path_pretrained_model
    os.makedirs(model_folder_pretraining, exist_ok=True)

    # RoBERTa model with a language-modeling head
    model = transformers.RobertaForMaskedLM(config_roberta)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # switch the model to training mode
    model.train()

    # NOTE(review): transformers.AdamW is deprecated upstream in favour of
    # torch.optim.AdamW (whose default weight_decay differs) -- migrate
    # deliberately.
    optim = transformers.AdamW(model.parameters(), lr=config["lr"])
    writer = SummaryWriter(log_dir=model_folder_pretraining)
    epochs = number_of_epochs  # pretrained for 4 ideally
    step = 0
    # Pretraining loop by epochs from here ....

    # flush and release the TensorBoard event file
    writer.close()
    print("Done")
# Run the pretraining preparation only when executed as a script.
if __name__ == "__main__":
    main()
- Provide in your issue the Nuitka options used:
py -m nuitka --standalone --nofollow-import-to=tensorflow --nofollow-import-to=keras --noinclude-default-mode=warning --noinclude-numba-mode=nofollow --module-parameter=numba-disable-jit=yes --assume-yes-for-downloads --module-parameter=torch-disable-jit=yes --include-data-dir=../PretrainTestNuitka/model_results=model_results --include-data-dir=../PretrainTestNuitka/pretrained_tokenizer=pretrained_tokenizer --include-data-dir=../PretrainTestNuitka/source_tokenizer=source_tokenizer MyScript2Test.py
-Messages during conversion:
Nuitka-Options: Used command line options: --standalone --nofollow-import-to=tensorflow --nofollow-import-to=keras Nuitka-Options: --noinclude-default-mode=warning --noinclude-numba-mode=nofollow --module-parameter=numba-disable-jit=yes Nuitka-Options: --assume-yes-for-downloads --module-parameter=torch-disable-jit=yes Nuitka-Options: --include-data-dir=../PretrainTestNuitka/model_results=model_results Nuitka-Options: --include-data-dir=../PretrainTestNuitka/pretrained_tokenizer=pretrained_tokenizer Nuitka-Options: --include-data-dir=../PretrainTestNuitka/source_tokenizer=source_tokenizer MyScript2Test.py Nuitka: Starting Python compilation with Nuitka '2.1.2' on Python '3.11' commercial grade 'not installed'. Nuitka-Options:WARNING: No data files in directory '../PretrainTestNuitka/model_results.' Nuitka: Completed Python level compilation and optimization. Nuitka: Generating source code for C backend compiler. Nuitka: Running data composer tool for optimal constant value handling. Nuitka: Running C compilation via Scons. Nuitka-Scons: Backend C compiler: gcc (gcc 13.2.0). Nuitka-Scons: Backend linking program with 4768 files (no progress information available for this stage). Nuitka-Scons: Compiled 4738 C files using ccache. Nuitka-Scons: Cached C files (using ccache) with result 'cache hit': 4736 Nuitka-Scons: Cached C files (using ccache) with result 'cache miss': 2 Nuitka-Options: Included data file 'pretrained_tokenizer\config.json' due to specified data dir Nuitka-Options: '../PretrainTestNuitka/pretrained_tokenizer' on command line. Nuitka-Options: Included data file 'pretrained_tokenizer\merges.txt' due to specified data dir Nuitka-Options: '../PretrainTestNuitka/pretrained_tokenizer' on command line. Nuitka-Options: Included data file 'pretrained_tokenizer\vocab.json' due to specified data dir Nuitka-Options: '../PretrainTestNuitka/pretrained_tokenizer' on command line. 
Nuitka-Options: Included data file 'pretrained_tokenizer\pretrained_tensors\attention_mask.pt' due to specified data dir Nuitka-Options: '../PretrainTestNuitka/pretrained_tokenizer' on command line. Nuitka-Options: Included data file 'pretrained_tokenizer\pretrained_tensors\input_ids.pt' due to specified data dir Nuitka-Options: '../PretrainTestNuitka/pretrained_tokenizer' on command line. Nuitka-Options: Included data file 'pretrained_tokenizer\pretrained_tensors\labels.pt' due to specified data dir Nuitka-Options: '../PretrainTestNuitka/pretrained_tokenizer' on command line. Nuitka-Options: Included 82 data files due to specified data dir '../PretrainTestNuitka/source_tokenizer' on command line. Nuitka-Plugins:data-files: Included data file 'certifi\cacert.pem' due to package data for 'certifi'. Nuitka-Plugins:data-files: Included data file 'grpc_cython_credentials\roots.pem' due to package data for 'grpc'. Nuitka-Plugins:data-files: Included data file 'pandas\io\formats\templates\html.tpl' due to package data directory 'templates' for Nuitka-Plugins:data-files: 'pandas.io.formats'. Nuitka-Plugins:data-files: Included data file 'pandas\io\formats\templates\html_style.tpl' due to package data directory Nuitka-Plugins:data-files: 'templates' for 'pandas.io.formats'. Nuitka-Plugins:data-files: Included data file 'pandas\io\formats\templates\html_table.tpl' due to package data directory Nuitka-Plugins:data-files: 'templates' for 'pandas.io.formats'. Nuitka-Plugins:data-files: Included data file 'pandas\io\formats\templates\latex.tpl' due to package data directory 'templates' for Nuitka-Plugins:data-files: 'pandas.io.formats'. Nuitka-Plugins:data-files: Included data file 'pandas\io\formats\templates\latex_longtable.tpl' due to package data directory Nuitka-Plugins:data-files: 'templates' for 'pandas.io.formats'. 
Nuitka-Plugins:data-files: Included data file 'pandas\io\formats\templates\latex_table.tpl' due to package data directory Nuitka-Plugins:data-files: 'templates' for 'pandas.io.formats'. Nuitka-Plugins:data-files: Included data file 'pandas\io\formats\templates\string.tpl' due to package data directory 'templates' Nuitka-Plugins:data-files: for 'pandas.io.formats'. Nuitka-Plugins:data-files: Included 602 data files due to package data directory 'zoneinfo' for 'pytz'. Nuitka-Plugins:data-files: Included data file 'scipy\stats_sobol_direction_numbers.npz' due to package data for 'scipy'. Nuitka-Plugins:data-files: Included 8101 data files due to package data directory 'include' for 'torch'. Nuitka-Plugins:dll-files: Found 1 file DLLs from numpy installation. Nuitka-Plugins:dll-files: Found 2 files DLLs from sklearn._distributor_init installation. Nuitka-Plugins:dll-files: Found 11 files DLLs from torch installation. Nuitka: Keeping build directory 'MyScript2Test.build'. Nuitka: Successfully created 'MyScript2Test.dist\MyScript2Test.exe'.