IGEV/IGEV-MVS/evaluate_mvs.py

451 lines
21 KiB
Python
Raw Permalink Normal View History

2023-03-20 19:52:04 +08:00
import argparse
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
import time
from datasets import find_dataset_def
from core.igev_mvs import IGEVMVS
from utils import *
import sys
import cv2
from datasets.data_io import read_pfm, save_pfm
from core.submodule import depth_unnormalization
from plyfile import PlyData, PlyElement
from tqdm import tqdm
from PIL import Image
cudnn.benchmark = True
parser = argparse.ArgumentParser(description='Predict depth, filter, and fuse')
parser.add_argument('--model', default='IterMVS', help='select model')
parser.add_argument('--dataset', default='dtu_yao_eval', help='select dataset')
2023-03-20 20:05:10 +08:00
parser.add_argument('--testpath', default='/data/DTU_data/dtu_test/', help='testing data path')
2023-03-20 19:52:04 +08:00
parser.add_argument('--testlist', default='./lists/dtu/test.txt', help='testing scan list')
parser.add_argument('--maxdisp', default=256)
parser.add_argument('--split', default='intermediate', help='select data')
parser.add_argument('--batch_size', type=int, default=2, help='testing batch size')
parser.add_argument('--n_views', type=int, default=5, help='num of view')
parser.add_argument('--img_wh', nargs='+', type=int, default=[640, 480],
help='height and width of the image')
parser.add_argument('--loadckpt', default='./pretrained_models/dtu.ckpt', help='load a specific checkpoint')
parser.add_argument('--outdir', default='./output/', help='output dir')
parser.add_argument('--display', action='store_true', help='display depth images and masks')
parser.add_argument('--iteration', type=int, default=32, help='num of iteration of GRU')
parser.add_argument('--geo_pixel_thres', type=float, default=1, help='pixel threshold for geometric consistency filtering')
parser.add_argument('--geo_depth_thres', type=float, default=0.01, help='depth threshold for geometric consistency filtering')
parser.add_argument('--photo_thres', type=float, default=0.3, help='threshold for photometric consistency filtering')
# parse arguments and check
args = parser.parse_args()
print("argv:", sys.argv[1:])
print_args(args)
if args.dataset=="dtu_yao_eval":
img_wh=(1600, 1152)
elif args.dataset=="tanks":
img_wh=(1920, 1024)
elif args.dataset=="eth3d":
img_wh = (1920,1280)
else:
img_wh = (args.img_wh[0], args.img_wh[1]) # custom dataset
# read intrinsics and extrinsics
def read_camera_parameters(filename):
with open(filename) as f:
lines = f.readlines()
lines = [line.rstrip() for line in lines]
# extrinsics: line [1,5), 4x4 matrix
extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4))
# intrinsics: line [7-10), 3x3 matrix
intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3))
return intrinsics, extrinsics
# read an image
def read_img(filename, img_wh):
img = Image.open(filename)
# scale 0~255 to 0~1
np_img = np.array(img, dtype=np.float32) / 255.
original_h, original_w, _ = np_img.shape
np_img = cv2.resize(np_img, img_wh, interpolation=cv2.INTER_LINEAR)
return np_img, original_h, original_w
# save a binary mask
def save_mask(filename, mask):
assert mask.dtype == np.bool_
mask = mask.astype(np.uint8) * 255
Image.fromarray(mask).save(filename)
def save_depth_img(filename, depth):
# assert mask.dtype == np.bool
depth = depth.astype(np.float32) * 255
Image.fromarray(depth).save(filename)
def read_pair_file(filename):
data = []
with open(filename) as f:
num_viewpoint = int(f.readline())
# 49 viewpoints
for view_idx in range(num_viewpoint):
ref_view = int(f.readline().rstrip())
src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
if len(src_views) != 0:
data.append((ref_view, src_views))
return data
# run MVS model to save depth maps
def save_depth():
# dataset, dataloader
MVSDataset = find_dataset_def(args.dataset)
if args.dataset=="dtu_yao_eval":
test_dataset = MVSDataset(args.testpath, args.testlist, args.n_views, img_wh)
elif args.dataset=="tanks":
test_dataset = MVSDataset(args.testpath, args.n_views, img_wh, args.split)
elif args.dataset=="eth3d":
test_dataset = MVSDataset(args.testpath, args.split, args.n_views, img_wh)
else:
test_dataset = MVSDataset(args.testpath, args.n_views, img_wh)
TestImgLoader = DataLoader(test_dataset, args.batch_size, shuffle=False, num_workers=4, drop_last=False)
# model
model = IGEVMVS(args)
model = nn.DataParallel(model)
model.cuda()
# load checkpoint file specified by args.loadckpt
print("loading model {}".format(args.loadckpt))
state_dict = torch.load(args.loadckpt)
model.load_state_dict(state_dict['model'])
model.eval()
with torch.no_grad():
tbar = tqdm(TestImgLoader)
for batch_idx, sample in enumerate(tbar):
start_time = time.time()
sample_cuda = tocuda(sample)
disp_prediction = model(sample_cuda["imgs"], sample_cuda["proj_matrices"],
sample_cuda["depth_min"], sample_cuda["depth_max"], test_mode=True)
b = sample_cuda["depth_min"].shape[0]
inverse_depth_min = (1.0 / sample_cuda["depth_min"]).view(b, 1, 1, 1)
inverse_depth_max = (1.0 / sample_cuda["depth_max"]).view(b, 1, 1, 1)
depth_prediction = depth_unnormalization(disp_prediction, inverse_depth_min, inverse_depth_max)
depth_prediction = tensor2numpy(depth_prediction.float())
del sample_cuda, disp_prediction
tbar.set_description('Iter {}/{}, time = {:.3f}'.format(batch_idx, len(TestImgLoader), time.time() - start_time))
filenames = sample["filename"]
# save depth maps and confidence maps
for filename, depth_est in zip(filenames, depth_prediction):
depth_filename = os.path.join(args.outdir, filename.format('depth_est', '.pfm'))
os.makedirs(depth_filename.rsplit('/', 1)[0], exist_ok=True)
# save depth maps
depth_est = np.squeeze(depth_est, 0)
save_pfm(depth_filename, depth_est)
# project the reference point cloud into the source view, then project back
def reproject_with_depth(depth_ref, intrinsics_ref, extrinsics_ref, depth_src, intrinsics_src, extrinsics_src):
width, height = depth_ref.shape[1], depth_ref.shape[0]
## step1. project reference pixels to the source view
# reference view x, y
x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height))
x_ref, y_ref = x_ref.reshape([-1]), y_ref.reshape([-1])
# reference 3D space
xyz_ref = np.matmul(np.linalg.inv(intrinsics_ref),
np.vstack((x_ref, y_ref, np.ones_like(x_ref))) * depth_ref.reshape([-1]))
# source 3D space
xyz_src = np.matmul(np.matmul(extrinsics_src, np.linalg.inv(extrinsics_ref)),
np.vstack((xyz_ref, np.ones_like(x_ref))))[:3]
# source view x, y
K_xyz_src = np.matmul(intrinsics_src, xyz_src)
xy_src = K_xyz_src[:2] / K_xyz_src[2:3]
## step2. reproject the source view points with source view depth estimation
# find the depth estimation of the source view
x_src = xy_src[0].reshape([height, width]).astype(np.float32)
y_src = xy_src[1].reshape([height, width]).astype(np.float32)
sampled_depth_src = cv2.remap(depth_src, x_src, y_src, interpolation=cv2.INTER_LINEAR)
# mask = sampled_depth_src > 0
# source 3D space
# NOTE that we should use sampled source-view depth_here to project back
xyz_src = np.matmul(np.linalg.inv(intrinsics_src),
np.vstack((xy_src, np.ones_like(x_ref))) * sampled_depth_src.reshape([-1]))
# reference 3D space
xyz_reprojected = np.matmul(np.matmul(extrinsics_ref, np.linalg.inv(extrinsics_src)),
np.vstack((xyz_src, np.ones_like(x_ref))))[:3]
# source view x, y, depth
depth_reprojected = xyz_reprojected[2].reshape([height, width]).astype(np.float32)
K_xyz_reprojected = np.matmul(intrinsics_ref, xyz_reprojected)
xy_reprojected = K_xyz_reprojected[:2] / (K_xyz_reprojected[2:3]+1e-6)
x_reprojected = xy_reprojected[0].reshape([height, width]).astype(np.float32)
y_reprojected = xy_reprojected[1].reshape([height, width]).astype(np.float32)
return depth_reprojected, x_reprojected, y_reprojected, x_src, y_src
def check_geometric_consistency(depth_ref, intrinsics_ref, extrinsics_ref, depth_src, intrinsics_src, extrinsics_src, thre1, thre2):
width, height = depth_ref.shape[1], depth_ref.shape[0]
x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height))
depth_reprojected, x2d_reprojected, y2d_reprojected, x2d_src, y2d_src = reproject_with_depth(depth_ref,
intrinsics_ref,
extrinsics_ref,
depth_src,
intrinsics_src,
extrinsics_src)
# check |p_reproj-p_1| < 1
dist = np.sqrt((x2d_reprojected - x_ref) ** 2 + (y2d_reprojected - y_ref) ** 2)
# check |d_reproj-d_1| / d_1 < 0.01
depth_diff = np.abs(depth_reprojected - depth_ref)
relative_depth_diff = depth_diff / depth_ref
masks=[]
for i in range(2,11):
mask = np.logical_and(dist < i/thre1, relative_depth_diff < i/thre2)
masks.append(mask)
depth_reprojected[~mask] = 0
return masks, mask, depth_reprojected, x2d_src, y2d_src
def filter_depth(scan_folder, out_folder, plyfilename, geo_pixel_thres, geo_depth_thres, photo_thres, img_wh, geo_mask_thres=3):
# the pair file
pair_file = os.path.join(scan_folder, "pair.txt")
# for the final point cloud
vertexs = []
vertex_colors = []
pair_data = read_pair_file(pair_file)
nviews = len(pair_data)
thre_left = -2
thre_right = 2
total_iter = 10
for iter in range(total_iter):
thre = (thre_left + thre_right) / 2
print(f"{iter} {10 ** thre}")
depth_est_averaged = []
geo_mask_all = []
# for each reference view and the corresponding source views
for ref_view, src_views in pair_data:
# load the camera parameters
ref_intrinsics, ref_extrinsics = read_camera_parameters(
os.path.join(scan_folder, 'cams_1/{:0>8}_cam.txt'.format(ref_view)))
ref_img, original_h, original_w = read_img(os.path.join(scan_folder, 'images/{:0>8}.jpg'.format(ref_view)), img_wh)
ref_intrinsics[0] *= img_wh[0]/original_w
ref_intrinsics[1] *= img_wh[1]/original_h
# load the estimated depth of the reference view
ref_depth_est = read_pfm(os.path.join(out_folder, 'depth_est/{:0>8}.pfm'.format(ref_view)))[0]
ref_depth_est = np.squeeze(ref_depth_est, 2)
all_srcview_depth_ests = []
# compute the geometric mask
geo_mask_sum = 0
geo_mask_sums=[]
n = 1 + len(src_views)
ct = 0
for src_view in src_views:
ct = ct + 1
# camera parameters of the source view
src_intrinsics, src_extrinsics = read_camera_parameters(
os.path.join(scan_folder, 'cams_1/{:0>8}_cam.txt'.format(src_view)))
_, original_h, original_w = read_img(os.path.join(scan_folder, 'images/{:0>8}.jpg'.format(src_view)), img_wh)
src_intrinsics[0] *= img_wh[0]/original_w
src_intrinsics[1] *= img_wh[1]/original_h
# the estimated depth of the source view
src_depth_est = read_pfm(os.path.join(out_folder, 'depth_est/{:0>8}.pfm'.format(src_view)))[0]
masks, geo_mask, depth_reprojected, _, _ = check_geometric_consistency(ref_depth_est, ref_intrinsics, ref_extrinsics,
src_depth_est,
src_intrinsics, src_extrinsics, 10 ** thre * 4, 10 ** thre * 1300)
if (ct==1):
for i in range(2,n):
geo_mask_sums.append(masks[i-2].astype(np.int32))
else:
for i in range(2,n):
geo_mask_sums[i-2]+=masks[i-2].astype(np.int32)
geo_mask_sum+=geo_mask.astype(np.int32)
all_srcview_depth_ests.append(depth_reprojected)
geo_mask=geo_mask_sum>=n
for i in range (2,n):
geo_mask=np.logical_or(geo_mask,geo_mask_sums[i-2]>=i)
depth_est_averaged.append((sum(all_srcview_depth_ests) + ref_depth_est) / (geo_mask_sum + 1))
geo_mask_all.append(np.mean(geo_mask))
final_mask = geo_mask
if iter == total_iter - 1:
os.makedirs(os.path.join(out_folder, "mask"), exist_ok=True)
save_mask(os.path.join(out_folder, "mask/{:0>8}_geo.png".format(ref_view)), geo_mask)
save_mask(os.path.join(out_folder, "mask/{:0>8}_final.png".format(ref_view)), final_mask)
print("processing {}, ref-view{:0>2}, geo_mask:{:3f} final_mask: {:3f}".format(scan_folder, ref_view,
geo_mask.mean(), final_mask.mean()))
if args.display:
cv2.imshow('ref_img', ref_img[:, :, ::-1])
cv2.imshow('ref_depth', ref_depth_est / np.max(ref_depth_est))
cv2.imshow('ref_depth * geo_mask', ref_depth_est * geo_mask.astype(np.float32) / np.max(ref_depth_est))
cv2.imshow('ref_depth * mask', ref_depth_est * final_mask.astype(np.float32) / np.max(ref_depth_est))
cv2.waitKey(0)
height, width = depth_est_averaged[-1].shape[:2]
x, y = np.meshgrid(np.arange(0, width), np.arange(0, height))
valid_points = final_mask
# print("valid_points", valid_points.mean())
x, y, depth = x[valid_points], y[valid_points], depth_est_averaged[-1][valid_points]
color = ref_img[valid_points]
xyz_ref = np.matmul(np.linalg.inv(ref_intrinsics),
np.vstack((x, y, np.ones_like(x))) * depth)
xyz_world = np.matmul(np.linalg.inv(ref_extrinsics),
np.vstack((xyz_ref, np.ones_like(x))))[:3]
vertexs.append(xyz_world.transpose((1, 0)))
vertex_colors.append((color * 255).astype(np.uint8))
if np.mean(geo_mask_all) >= 0.25:
thre_left = thre
else:
thre_right = thre
vertexs = np.concatenate(vertexs, axis=0)
vertex_colors = np.concatenate(vertex_colors, axis=0)
vertexs = np.array([tuple(v) for v in vertexs], dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')])
vertex_colors = np.array([tuple(v) for v in vertex_colors], dtype=[('red', 'u1'), ('green', 'u1'), ('blue', 'u1')])
vertex_all = np.empty(len(vertexs), vertexs.dtype.descr + vertex_colors.dtype.descr)
for prop in vertexs.dtype.names:
vertex_all[prop] = vertexs[prop]
for prop in vertex_colors.dtype.names:
vertex_all[prop] = vertex_colors[prop]
el = PlyElement.describe(vertex_all, 'vertex')
PlyData([el]).write(plyfilename)
print("saving the final model to", plyfilename)
if __name__ == '__main__':
save_depth()
if args.dataset=="dtu_yao_eval":
with open(args.testlist) as f:
scans = f.readlines()
scans = [line.rstrip() for line in scans]
for scan in scans:
scan_id = int(scan[4:])
scan_folder = os.path.join(args.testpath, scan)
out_folder = os.path.join(args.outdir, scan)
filter_depth(scan_folder, out_folder, os.path.join(args.outdir, 'igev_mvs{:0>3}_l3.ply'.format(scan_id)),
args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, 4)
elif args.dataset=="tanks":
# intermediate dataset
if args.split == "intermediate":
scans = ['Family', 'Francis', 'Horse', 'Lighthouse',
'M60', 'Panther', 'Playground', 'Train']
geo_mask_thres = {'Family': 5,
'Francis': 6,
'Horse': 5,
'Lighthouse': 6,
'M60': 5,
'Panther': 5,
'Playground': 5,
'Train': 5}
for scan in scans:
scan_folder = os.path.join(args.testpath, args.split, scan)
out_folder = os.path.join(args.outdir, scan)
filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])
# advanced dataset
elif args.split == "advanced":
scans = ['Auditorium', 'Ballroom', 'Courtroom',
'Museum', 'Palace', 'Temple']
geo_mask_thres = {'Auditorium': 3,
'Ballroom': 4,
'Courtroom': 4,
'Museum': 4,
'Palace': 5,
'Temple': 4}
for scan in scans:
scan_folder = os.path.join(args.testpath, args.split, scan)
out_folder = os.path.join(args.outdir, scan)
filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])
elif args.dataset=="eth3d":
if args.split == "test":
scans = ['botanical_garden', 'boulders', 'bridge', 'door',
'exhibition_hall', 'lecture_room', 'living_room', 'lounge',
'observatory', 'old_computer', 'statue', 'terrace_2']
geo_mask_thres = {'botanical_garden':1, # 30 images, outdoor
'boulders':1, # 26 images, outdoor
'bridge':2, # 110 images, outdoor
'door':2, # 6 images, indoor
'exhibition_hall':2, # 68 images, indoor
'lecture_room':2, # 23 images, indoor
'living_room':2, # 65 images, indoor
'lounge':1,# 10 images, indoor
'observatory':2, # 27 images, outdoor
'old_computer':2, # 54 images, indoor
'statue':2, # 10 images, indoor
'terrace_2':2 # 13 images, outdoor
}
for scan in scans:
start_time = time.time()
scan_folder = os.path.join(args.testpath, scan)
out_folder = os.path.join(args.outdir, scan)
filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])
print('scan: '+scan+' time = {:3f}'.format(time.time() - start_time))
elif args.split == "train":
scans = ['courtyard', 'delivery_area', 'electro', 'facade',
'kicker', 'meadow', 'office', 'pipes', 'playground',
'relief', 'relief_2', 'terrace', 'terrains']
geo_mask_thres = {'courtyard':1, # 38 images, outdoor
'delivery_area':2, # 44 images, indoor
'electro':1, # 45 images, outdoor
'facade':2, # 76 images, outdoor
'kicker':1, # 31 images, indoor
'meadow':1, # 15 images, outdoor
'office':1, # 26 images, indoor
'pipes':1,# 14 images, indoor
'playground':1, # 38 images, outdoor
'relief':1, # 31 images, indoor
'relief_2':1, # 31 images, indoor
'terrace':1, # 23 images, outdoor
'terrains':2 # 42 images, indoor
}
for scan in scans:
start_time = time.time()
scan_folder = os.path.join(args.testpath, scan)
out_folder = os.path.join(args.outdir, scan)
filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])
print('scan: '+scan+' time = {:3f}'.format(time.time() - start_time))
else:
filter_depth(args.testpath, args.outdir, os.path.join(args.outdir, 'custom.ply'),
2023-03-20 20:05:10 +08:00
args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres=3)