Skip to content

Instantly share code, notes, and snippets.

View xiabingquan's full-sized avatar

xiabingquan xiabingquan

View GitHub Profile
@xiabingquan
xiabingquan / intert_white_spaces.py
Last active August 26, 2024 15:21
A super easy (but useful) script that inserts a space between English words and adjacent Chinese characters.
import re
import sys
def insert_spaces(text):
    """Insert a space between an ASCII letter and an adjacent CJK ideograph.

    A single space is added at every boundary where an ASCII letter
    (a-z, A-Z) is directly followed by, or directly preceded by, a CJK
    ideograph. Boundaries already separated by any other character
    (including an existing space) are left alone.

    The ideograph class covers the full CJK Unified Ideographs block
    (U+4E00-U+9FFF) and Extension A (U+3400-U+4DBF). The narrower
    U+4E00-U+9FA5 range would miss the later additions to the main block
    and all of Extension A.

    Args:
        text: The string to process.

    Returns:
        A new string with the spaces inserted.
    """
    letter = r'[a-zA-Z]'
    ideograph = r'[\u3400-\u4dbf\u4e00-\u9fff]'
    # Two passes: letter -> ideograph boundaries first, then the reverse.
    # The inserted spaces never create or remove a boundary of the other
    # kind, so the order of the passes does not affect the result.
    text = re.sub('(' + letter + ')(' + ideograph + ')', r'\1 \2', text)
    text = re.sub('(' + ideograph + ')(' + letter + ')', r'\1 \2', text)
    return text
if __name__ == '__main__':
@xiabingquan
xiabingquan / bpe_tokenizer_from_scratch.py
Last active August 21, 2024 08:27
Building a BPE (Byte-Pair Encoding) tokenizer from scratch.
# A minimal example of how to implement a byte-pair encoding (BPE) tokenizer from scratch in Python.
# Reference: https://github.com/karpathy/minbpe
# Contact: [email protected]
def get_stats(byte_arr):
# get the frequency of each byte pair in the text
count = {}
for pair in zip(byte_arr[:-1], byte_arr[1:]): # e.g. pair: (b'a', b' ')
count[pair] = count.get(pair, 0) + 1
@xiabingquan
xiabingquan / flash_attention_in_numpy.py
Last active November 1, 2024 14:08
A toy example of flash attention implemented in NumPy.
# A minimal example of flash attention implemented in NumPy
# Contact: bingquanxia AT qq.com
import unittest
from typing import List
import numpy as np
import torch
@xiabingquan
xiabingquan / transformer_all_in_one.py
Created December 6, 2023 14:52
Implement a Transformer from scratch. All modules included in one file!
# coding=utf-8
# Contact: [email protected]
import numpy as np
import torch
import torch.nn as nn
def get_len_mask(b: int, max_len: int, feat_lens: torch.Tensor, device: torch.device) -> torch.Tensor:
@xiabingquan
xiabingquan / visualize_edit_distance.py
Last active December 12, 2022 06:10
Visualize errors of Edit Distance (Insertion, Deletion and Replacement)
from argparse import ArgumentParser
import editdistance
from rich.text import Text
from rich.console import Console
def edit_dist_dp(gt, hyp):
"""
A Dynamic Programming based Python program for edit distance problem