# ---------------------------------------
# -*- coding: utf-8 -*-
# @Time      : 2022/8/31 16:04
# @Author    : wzy
# @File      : img_patch.py
# @Reference : https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/models_mae.py
# ---------------------------------------
import math
import torch
from einops import rearrange


def patchify(img, patch_size):
    """
    Split a full image into non-overlapping patch tokens.
    :param img: (B, C, H, W)
    :param patch_size: side length of each square patch
    :return: tokens: (B, NUM, patch_size^2 * C)
    """
    # Only the case H == W with H divisible by patch_size is handled.
    assert img.shape[2] == img.shape[3] and img.shape[2] % patch_size == 0
    tokens = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_size, p2=patch_size)
    return tokens
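

# For reference, a plain-torch sketch of what the einops pattern above expands
# to (an illustrative addition, not part of the original file; the official MAE
# code expresses the same operation with reshape + einsum):
def patchify_plain(img, patch_size):
    b, c, hh, ww = img.shape
    h, w = hh // patch_size, ww // patch_size
    x = img.reshape(b, c, h, patch_size, w, patch_size)      # split H and W into (h, p1) and (w, p2)
    x = x.permute(0, 2, 4, 3, 5, 1)                          # -> (b, h, w, p1, p2, c)
    return x.reshape(b, h * w, patch_size * patch_size * c)  # flatten to (B, NUM, p1*p2*C)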


def unpatchify(tokens, patch_size):
    """
    Reassemble patch tokens back into the full image.
    :param tokens: (B, NUM, patch_size^2 * C)
    :param patch_size: side length of each square patch
    :return: img: (B, C, H, W)
    """
    _, n, d = tokens.shape
    h = w = int(math.sqrt(n))
    img = rearrange(tokens, 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)', h=h, w=w, p1=patch_size, p2=patch_size)
    return img


def masking(tokens, mask_ratio):
    """
    Generate a random mask over the patches obtained above (this part follows the official MAE implementation).
    :param tokens: (B, NUM, patch_size^2 * C)
    :param mask_ratio: fraction of all patches to mask out
    :return: x_masked: the patches left after masking, fed to the encoder
    :return: mask: binary code over the patches (a stand-in for the image), 1 for masked, 0 for kept
    :return: ids_restore: indices that restore the shuffled sequence to the original patch order
    """
    b, n, d = tokens.shape
    remain_num = int(n * (1 - mask_ratio))  # number of patches left unmasked
    noise = torch.rand(b, n, device=tokens.device)  # noise in [0, 1], one random value per patch, used for the sorting and masking below
    # sort the noise in ascending order and return the corresponding indices
    ids_shuffle = torch.argsort(noise, dim=1)
    # argsort again to recover the original ordering of the noise (neat trick)
    ids_restore = torch.argsort(ids_shuffle, dim=1)
    # keep the ids of the first remain_num patches, i.e. the unmasked ones
    ids_keep = ids_shuffle[:, :remain_num]
    # torch.gather: pick out the values of the input at the positions given by index
    x_masked = torch.gather(tokens, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, d))
    # build the mask: 0 is keep, 1 is remove
    mask = torch.ones([b, n], device=tokens.device)
    mask[:, :remain_num] = 0  # at this point all the zeros sit at the front; they must be moved back to their original patch positions
    # use the stored original ordering ids_restore to obtain the real mask matrix
    mask = torch.gather(mask, dim=1, index=ids_restore)
    return x_masked, mask, ids_restore
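

# A minimal decoder-side sketch (an illustrative addition, not part of the
# original file) of how ids_restore is used in MAE: masked positions are
# filled with a mask token (a zero placeholder here; MAE learns it) and the
# sequence is unshuffled back to the original patch order.
def restore(x_masked, ids_restore):
    b, n_keep, d = x_masked.shape
    n = ids_restore.shape[1]
    mask_tokens = torch.zeros(b, n - n_keep, d, device=x_masked.device)  # placeholder for MAE's learnable mask token
    x_full = torch.cat([x_masked, mask_tokens], dim=1)
    # undo the shuffle: output position i receives the token that originally sat at patch i
    return torch.gather(x_full, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, d))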


if __name__ == '__main__':
    img = torch.randn(1, 3, 224, 224)
    tokens = patchify(img, 16)
    print('tokens:', tokens.shape)  # (1, 196, 768)
    recon = unpatchify(tokens, 16)
    print('img:', recon.shape)      # (1, 3, 224, 224)
    x_masked, mask, ids_restore = masking(tokens, 0.75)
    print('x_masked:', x_masked.shape)  # (1, 49, 768): 25% of 196 patches kept
    print('mask:', mask.shape)          # (1, 196)
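    # Illustrative checks (not part of the reference code): the plain-torch
    # patchify sketch matches the einops version, unpatchify exactly inverts
    # patchify (rearrange only permutes elements), and restore() brings the
    # kept tokens back to their original positions.
    assert torch.equal(patchify_plain(img, 16), tokens)
    assert torch.equal(recon, img)
    print('restored:', restore(x_masked, ids_restore).shape)  # (1, 196, 768)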