# extractor.py

# -*- coding: utf-8 -*-
"""Experiments_on_CLIP_vieCap4H_ViDRSNet.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Fp647OiEt-5MC8xYbtenMhgR18zmPDMP
"""

# from google.colab import drive
# drive.mount('/content/drive')

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/drive/MyDrive/viecap4h-experiments

# !pip install ftfy regex tqdm
# !pip install git+https://github.com/openai/CLIP

"""## VinVL region features extraction"""

# !python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@ffff8acc35ea88ad1cb1806ab0f00b4c1c5dbfd9'
# !pip install fvcore

# !pip install ipython h5py nltk joblib jupyter pandas scipy
# !pip install ninja 'yacs>=0.1.8' cython matplotlib tqdm opencv-python 'numpy>=1.19.5'
# !pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2
# !pip install timm einops
# !pip install pycocotools
# !pip install cityscapesscripts

# Commented out IPython magic to ensure Python compatibility.
# !git clone https://github.com/microsoft/scene_graph_benchmark
# %cd scene_graph_benchmark
# !python setup.py build develop

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/drive/MyDrive/viecap4h-experiments/scene_graph_benchmark
# !wget https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/drive/MyDrive/viecap4h-experiments/scene_graph_benchmark
import sys; sys.path.append('./scene_graph_benchmark/scene_graph_benchmark')
from scene_graph_benchmark.scene_parser import SceneParser
from scene_graph_benchmark.AttrRCNN import AttrRCNN
from maskrcnn_benchmark.data.transforms import build_transforms
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.config import cfg
from scene_graph_benchmark.config import sg_cfg
from maskrcnn_benchmark.data.datasets.utils.load_files import \
    config_dataset_file
from maskrcnn_benchmark.data.datasets.utils.load_files import load_labelmap_file
from maskrcnn_benchmark.utils.miscellaneous import mkdir

import torch
import clip
from PIL import Image

# CLIP is pinned to the CPU; the commented line is the disabled automatic
# CUDA/CPU selection.
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
# Load the ViT-L/14 CLIP checkpoint onto `device`. `preprocess` is the image
# transform returned alongside the model and is applied to every image before
# it is fed to `clip_model` (see `extract_clip_feature`).
clip_model, preprocess = clip.load("ViT-L/14", device=device)


def extract_clip_feature(image, device):
    """Encode one image with the CLIP visual transformer and chunk the result.

    The forward pass of ``clip_model.visual`` is unrolled by hand: patch
    convolution, class token, positional embedding, pre layer norm,
    transformer, and post layer norm on the class token.  Nothing else is
    applied after ``ln_post``; the resulting vector is split into 64 equal
    chunks.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform
            (the caller in this file passes a ``PIL.Image``).
        device: Device the preprocessed tensor is moved to.  It must be the
            device ``clip_model`` was loaded on.

    Returns:
        Tensor of shape ``[1, 64, width // 64]`` where ``width`` is the size
        of the class-token embedding.  ``width`` must be divisible by 64,
        otherwise ``view`` raises.
    """
    visual = clip_model.visual
    # Feed the model in the dtype of its own weights.  A hard float32 cast via
    # `.type(torch.FloatTensor)` first drags the tensor back to the CPU and
    # then mismatches the weights whenever they are stored in half precision.
    weight_dtype = visual.conv1.weight.dtype
    image = preprocess(image).unsqueeze(0).to(device=device, dtype=weight_dtype)

    with torch.no_grad():
        x = visual.conv1(image)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        # Broadcast the learned class embedding to one token per batch item.
        class_token = visual.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device
        )
        x = torch.cat([class_token, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + visual.positional_embedding.to(x.dtype)
        x = visual.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = visual.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        # Keep only the class token (index 0 along the sequence axis).
        x = visual.ln_post(x[:, 0, :])
    return x.view(x.size(0), 64, -1)

import os
import glob
import cv2
import torch
from PIL import Image
import numpy as np
#import tqdm
import json
import h5py
import torch.nn as nn
from tqdm import tqdm

# Build the detector configuration.
# New keys are allowed only while the scene-graph defaults (`sg_cfg`) are
# merged into the base maskrcnn_benchmark `cfg`; the flag is switched off
# again straight afterwards.
cfg.set_new_allowed(True)
cfg.merge_from_other_cfg(sg_cfg)
cfg.set_new_allowed(False)
# Apply the VinVL X152-C4 model configuration file, then the run-specific
# overrides below (a flat list of alternating key / value entries).
cfg.merge_from_file('./scene_graph_benchmark/sgg_configs/vgattr/vinvl_x152c4.yaml')
argument_list = [
                 'MODEL.WEIGHT', './../model/vinvl_vg_x152c4.pth',
                 'MODEL.ROI_HEADS.NMS_FILTER', 1,
                 'MODEL.ROI_HEADS.SCORE_THRESH', 0.2,
                 'TEST.IGNORE_BOX_REGRESSION', False,
                 'MODEL.ATTRIBUTE_ON', True,
                 'MODEL.DEVICE', 'cuda:0',
                 'TEST.OUTPUT_FEATURE', True,
]

cfg.merge_from_list(argument_list)
# Freeze the configuration before it is handed to the model, transforms and
# checkpointer.
cfg.freeze()

output_dir = cfg.OUTPUT_DIR

# Instantiate the detector on the configured device and put it in eval mode;
# this script only runs inference.
model = AttrRCNN(cfg)
model.to(cfg.MODEL.DEVICE)
model.eval()

# Test-time image transforms, and the checkpointer that loads the weights
# named by MODEL.WEIGHT into `model`.
transforms = build_transforms(cfg, is_train=False)
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
checkpointer.load(cfg.MODEL.WEIGHT)

def cv2Img_to_Image(input_img):
    """Turn an OpenCV BGR array into an RGB ``PIL.Image``.

    The array is copied before the colour conversion, so the caller's buffer
    is left untouched.
    """
    rgb_pixels = cv2.cvtColor(input_img.copy(), cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_pixels)

import numpy as np

# Extract features

# Maps each dataset split name to the directory that holds its images.
# The split name is also embedded in the name of the HDF5 file written for it.
dict_viecap4h_set = {
    'test': './../dataset/test',
}

import torch
# Release unused memory cached by PyTorch's CUDA allocator before extraction.
torch.cuda.empty_cache()

# For every split, write one HDF5 file holding, per image, the VinVL region
# features (`<id>_features`) and the CLIP features (`<id>_grids`).
for viecap4h_set in dict_viecap4h_set:
    print('Extracting', viecap4h_set)
    save_dir = './../model/feature'
    # h5py does not create missing parent directories.
    os.makedirs(save_dir, exist_ok=True)

    # NOTE: demo mode - only this single image is processed. The split's image
    # directory from `dict_viecap4h_set` is not scanned.
    img_path = './../dataset/test/1.jpeg'
    filename = os.path.basename(img_path)
    image_key = filename.split('.')[0]

    output_path = os.path.join(save_dir, 'demo_' + viecap4h_set + '_clip_features.hdf5')
    # The context manager closes the file even when extraction fails part-way.
    with h5py.File(output_path, 'w') as save_hdf5:
        # cv2.imread reports an unreadable file by returning None, so test for
        # that explicitly instead of relying on a later exception.
        try:
            image = cv2.imread(img_path)
        except cv2.error:
            image = None

        if image is not None:
            img_input = cv2Img_to_Image(image)
        else:
            # PIL decodes to RGB already, so no BGR->RGB swap must be applied;
            # convert() also expands grayscale / palette / alpha images to
            # three channels.
            img_input = Image.open(img_path).convert('RGB')

        # Keep the untransformed PIL image: CLIP applies its own preprocessing.
        img_fit_to_get_clip = img_input
        img_input, _ = transforms(img_input, target=None)
        img_input = img_input.to(cfg.MODEL.DEVICE)
        # NOTE(review): measured after `transforms`, so this is the size of the
        # transformed tensor rather than of the file on disk - confirm intent.
        raw_height, raw_width = img_input.shape[-2:]

        with torch.no_grad():
            # .float() casts in place on the current device, whereas
            # .type(torch.FloatTensor) would pull the tensor back to the CPU.
            prediction = model(img_input.float())[0].to('cpu')

        prediction = prediction.resize((raw_width, raw_height))
        det_dict = {key: prediction.get_field(key) for key in prediction.fields()}
        box_features = det_dict['box_features']

        # Get boxes: for every region keep the box of its best-scoring class.
        # NOTE: `boxes_all` is currently not written to the file.
        boxes_all = torch.zeros(box_features.size(0), 4)
        max_indexes = np.argmax(det_dict['scores_all'], axis=-1)
        for idx, (max_idx, box) in enumerate(zip(max_indexes, det_dict['boxes_all'])):
            boxes_all[idx] = box[max_idx]

        # Get CLIP features on the device the CLIP model was loaded on.
        clip_grid_features = extract_clip_feature(img_fit_to_get_clip, device)

        save_hdf5.create_dataset(image_key + '_features', data=box_features.cpu().detach().numpy())
        save_hdf5.create_dataset(image_key + '_grids', data=clip_grid_features.cpu().detach().numpy())