import onnxruntime as ort import torch from PIL import Image from clip import clip from numpy import ndarray from onnxruntime.transformers import optimizer from torch import Tensor from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode model = "clip-text-encoder.onnx" def to_numpy(tensor: Tensor, dtype=None): r: ndarray = tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() r.astype(dtype=dtype) return r if __name__ == '__main__': input_text = "by the sea" text_encoder = ort.InferenceSession(model) text_input = clip.tokenize(input_text) print("Token:", text_input, len(text_input[0])) print("InputNames", text_encoder.get_inputs()[0].name) arr = text_encoder.run(None, {text_encoder.get_inputs()[0].name: to_numpy(text_input, dtype=int)})[0] print(arr)