```python
def unicodeToAscii(s):
    """
    Convert a Unicode string to plain ASCII (strip accents and drop characters
    outside the allowed character set)

    Args:
        s (str): input Unicode string

    Returns:
        str: the processed ASCII string
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)  # decompose the string into NFD form
        if unicodedata.category(c) != 'Mn'          # filter out combining marks (e.g. accents)
        and c in allowed_characters                 # keep only characters in the allowed set
    )

# You can test it with the example below
print(f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}")
```
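The function above relies on `allowed_characters` (and the `unicodedata` import) being defined earlier in the tutorial. If you are running this section on its own, a minimal sketch of such a setup, where the exact character set is an assumption, could look like this:

```python
import string
import unicodedata

# Hypothetical character set: ASCII letters, a few punctuation marks,
# and "_" as the placeholder for out-of-vocabulary characters
allowed_characters = string.ascii_letters + " .,;'" + "_"
n_letters = len(allowed_characters)
```

With a set like this, `unicodeToAscii('Ślusàrski')` comes out as `Slusarski`: NFD normalization splits each accented letter into a base letter plus a combining mark, and the combining mark is then filtered out.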
Turning Names into Tensors
Now that we have all the names organized, we need to turn them into Tensors in order to make any use of them.
To represent a single letter, we use a one-hot vector of size &lt;1 x n_letters&gt;. A one-hot vector is filled with 0s except for a 1 at the index of the current letter, e.g. "b" = &lt;0 1 0 0 0 ...&gt;. To make a word we join a bunch of those into a 2D matrix of size &lt;line_length x 1 x n_letters&gt;. The extra 1 dimension is there because PyTorch assumes everything comes in batches; here we simply use a batch size of 1.
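For intuition, here is a minimal sketch of what that layout looks like for a hypothetical two-letter name, assuming the character set sketched above (so 'a' maps to index 0 and 'b' to index 1):

```python
import torch

n_letters = 58  # hypothetical alphabet size; use len(allowed_characters) in practice
name_tensor = torch.zeros(2, 1, n_letters)  # two letters, batch dimension of 1
name_tensor[0][0][0] = 1  # first letter 'a' -> index 0
name_tensor[1][0][1] = 1  # second letter 'b' -> index 1
print(name_tensor.shape)  # torch.Size([2, 1, 58])
```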
```python
# Find the index of a letter in allowed_characters, e.g. "a" = 0
def letterToIndex(letter):
    """
    Convert a single character into its index in allowed_characters

    Args:
        letter (str): input character (must have length 1)

    Returns:
        int: index of the character (unknown characters map to the index of "_")
    """
    # If the character is not in the allowed set, return the index of "_"
    # (our representation for out-of-vocabulary characters)
    if letter not in allowed_characters:
        return allowed_characters.find("_")
    else:
        return allowed_characters.find(letter)  # position of the character in allowed_characters
```
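As a quick sanity check, and assuming the character set sketched earlier (lowercase letters first, then uppercase), the mapping behaves like this:

```python
print(letterToIndex('a'))   # 0 under the assumed character set
print(letterToIndex('A'))   # 26, since uppercase letters follow the 26 lowercase ones
print(letterToIndex('好'))  # falls back to the index of '_' for out-of-vocabulary characters
```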
```python
# Turn a line of text into a tensor of shape <line_length x 1 x n_letters>,
# i.e. a sequence of one-hot vectors, one per character
def lineToTensor(line):
    """
    Convert a string into a 3D tensor (sequence length x 1 x alphabet size)

    Args:
        line (str): input line of text

    Returns:
        torch.Tensor: one-hot encoded tensor of shape (len(line), 1, n_letters)
    """
    # Initialise an all-zero tensor: [sequence length, 1 (batch dimension), alphabet size]
    tensor = torch.zeros(len(line), 1, n_letters)
    # For each character, set a 1 at its index in the corresponding one-hot vector
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

# Here are some examples
print(f"The letter 'a' becomes {lineToTensor('a')}")    # notice that the first position in the tensor = 1
print(f"The name 'Ahn' becomes {lineToTensor('Ahn')}")  # notice 'A' sets the 27th index to 1
```
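Rather than printing the full tensors, it can be easier to check just the shapes. A small usage sketch, relying on the functions defined above:

```python
print(lineToTensor('a').shape)    # shape (1, 1, n_letters): one letter, batch size 1
print(lineToTensor('Ahn').shape)  # shape (3, 1, n_letters): three letters, batch size 1
```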
```python
def train(rnn, training_data, n_epoch = 10, n_batch_size = 64, report_every = 50,
          learning_rate = 0.2, criterion = nn.NLLLoss()):
    """
    Learn on a batch of training_data for a specified number of iterations and reporting thresholds
    """
    # Keep track of losses for plotting
    current_loss = 0
    all_losses = []
    rnn.train()
    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

    start = time.time()
    print(f"training on data set with n = {len(training_data)}")

    for iter in range(1, n_epoch + 1):
        rnn.zero_grad()  # clear the gradients

        # create some minibatches
        # we cannot use dataloaders because each of our names is a different length
        batches = list(range(len(training_data)))
        random.shuffle(batches)
        batches = np.array_split(batches, len(batches) // n_batch_size)

        for idx, batch in enumerate(batches):
            batch_loss = 0
            for i in batch:  # for each example in this batch
                (label_tensor, text_tensor, label, text) = training_data[i]
                output = rnn.forward(text_tensor)
                loss = criterion(output, label_tensor)
                batch_loss += loss
```