pytorch/text
View on GitHub
[HELP] SentencePiece is not compatible with DataLoader on the Windows platform
Open
#1,076 opened on Nov 9, 2020
Windows, help wanted
Description
We added a test to cover the compatibility between SentencePiece and DataLoader. The test passes on the Linux platform but fails on the Windows platform. We need some experts to help debug.
self = <test.experimental.test_transforms_with_asset.TestTransformsWithAsset testMethod=test_sentencepiece_with_dataloader>
def test_sentencepiece_with_dataloader(self):
sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_bpe_25000'])
spm_processor = sentencepiece_processor(sp_model_path)
_path = os.path.join(self.project_root, '.data', 'text_bpe_25000.model')
os.remove(_path)
example_strings = ['the pretrained spm model names'] * 64
ref_results = torch.tensor([[13, 1465, 12824, 304, 24935, 5771, 3776]] * 16, dtype=torch.long)
def batch_func(data):
return torch.tensor([spm_processor(text) for text in data], dtype=torch.long)
dataloader = DataLoader(example_strings, batch_size=16, num_workers=2, collate_fn=batch_func)
> for item in dataloader:
test\experimental\test_transforms_with_asset.py:185:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
env\lib\site-packages\torch\utils\data\dataloader.py:359: in __iter__
return self._get_iterator()
env\lib\site-packages\torch\utils\data\dataloader.py:301: in _get_iterator
return _MultiProcessingDataLoaderIter(self)
env\lib\site-packages\torch\utils\data\dataloader.py:885: in __init__
w.start()
env\lib\multiprocessing\process.py:105: in start
self._popen = self._Popen(self)
env\lib\multiprocessing\context.py:223: in _Popen
return _default_context.get_context().Process._Popen(process_obj)
env\lib\multiprocessing\context.py:322: in _Popen
return Popen(process_obj)
env\lib\multiprocessing\popen_spawn_win32.py:65: in __init__
reduction.dump(process_obj, to_child)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
obj = <Process(Process-1, initial daemon)>, file = <_io.BufferedWriter name=11>
protocol = None
def dump(obj, file, protocol=None):
'''Replacement for pickle.dump() using ForkingPickler.'''
> ForkingPickler(file, protocol).dump(obj)
E AttributeError: Can't pickle local object 'TestTransformsWithAsset.test_sentencepiece_with_dataloader.<locals>.batch_func'
env\lib\multiprocessing\reduction.py:60: AttributeError
cc @peterjc123 @maxluk @nbcsm @guyang3532 @gunandrose4u @smartcat2010 @mszhanyi