Add files via upload

parent 84baaf7d8e
commit e079f027ff
0
IGEV-MVS/core/__init__.py
Normal file
61
IGEV-MVS/core/corr.py
Normal file
@@ -0,0 +1,61 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .submodule import *


class CorrBlock1D_Cost_Volume:
    def __init__(self, init_corr, corr, num_levels=2, radius=4, inverse_depth_min=None, inverse_depth_max=None, num_sample=None):
        self.num_levels = num_levels  # honor the argument instead of hard-coding 2
        self.radius = radius
        self.inverse_depth_min = inverse_depth_min
        self.inverse_depth_max = inverse_depth_max
        self.num_sample = num_sample
        self.corr_pyramid = []
        self.init_corr_pyramid = []

        # reshape the [B, C, D, H, W] volumes so each pixel's depth dimension
        # becomes a 1-D signal that grid_sample can index
        b, c, d, h, w = corr.shape
        corr = corr.permute(0, 3, 4, 1, 2).reshape(b*h*w, 1, 1, d)
        init_corr = init_corr.permute(0, 3, 4, 1, 2).reshape(b*h*w, 1, 1, d)

        self.corr_pyramid.append(corr)
        self.init_corr_pyramid.append(init_corr)

        # build average-pooled pyramids over the depth dimension
        for i in range(self.num_levels):
            corr = F.avg_pool2d(corr, [1, 2], stride=[1, 2])
            self.corr_pyramid.append(corr)

        for i in range(self.num_levels):
            init_corr = F.avg_pool2d(init_corr, [1, 2], stride=[1, 2])
            self.init_corr_pyramid.append(init_corr)

    def __call__(self, disp):
        r = self.radius
        b, _, h, w = disp.shape
        out_pyramid = []
        for i in range(self.num_levels):
            corr = self.corr_pyramid[i]
            init_corr = self.init_corr_pyramid[i]
            # sample a (2r+1)-tap window centred on the current disparity
            dx = torch.linspace(-r, r, 2*r+1)
            dx = dx.view(1, 1, 2*r+1, 1).to(disp.device)
            x0 = dx + disp.reshape(b*h*w, 1, 1, 1) / 2**i
            y0 = torch.zeros_like(x0)

            disp_lvl = torch.cat([x0, y0], dim=-1)
            corr = bilinear_sampler(corr, disp_lvl)
            corr = corr.view(b, h, w, -1)

            init_corr = bilinear_sampler(init_corr, disp_lvl)
            init_corr = init_corr.view(b, h, w, -1)

            out_pyramid.append(corr)
            out_pyramid.append(init_corr)

        out = torch.cat(out_pyramid, dim=-1)
        return out.permute(0, 3, 1, 2).contiguous().float()
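For reference, a minimal sketch of how this lookup is driven, assuming it is run from the IGEV-MVS directory; the shapes are illustrative assumptions. Both volumes are [B, 1, D, H, W], and each call concatenates a (2*radius+1)-tap window from both pyramids at every level:

    import torch
    from core.corr import CorrBlock1D_Cost_Volume

    B, D, H, W = 1, 64, 32, 40                # assumed sizes
    init_corr = torch.randn(B, 1, D, H, W)    # raw matching volume
    corr = torch.randn(B, 1, D, H, W)         # regularized volume
    disp = torch.full((B, 1, H, W), D / 2.0)  # current depth-index estimate

    lookup = CorrBlock1D_Cost_Volume(init_corr, corr, num_levels=2, radius=4)
    feat = lookup(disp)
    print(feat.shape)  # torch.Size([1, 36, 32, 40]): 2 volumes x 2 levels x 9 taps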
212
IGEV-MVS/core/extractor.py
Normal file
@@ -0,0 +1,212 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
import math
from .submodule import *

class ResidualBlock(nn.Module):
    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        if norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if not (stride == 1 and in_planes == planes):
                self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)

        elif norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(planes)
            self.norm2 = nn.BatchNorm2d(planes)
            if not (stride == 1 and in_planes == planes):
                self.norm3 = nn.BatchNorm2d(planes)

        elif norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(planes)
            self.norm2 = nn.InstanceNorm2d(planes)
            if not (stride == 1 and in_planes == planes):
                self.norm3 = nn.InstanceNorm2d(planes)

        elif norm_fn == 'none':
            self.norm1 = nn.Sequential()
            self.norm2 = nn.Sequential()
            if not (stride == 1 and in_planes == planes):
                self.norm3 = nn.Sequential()

        if stride == 1 and in_planes == planes:
            self.downsample = None
        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)

    def forward(self, x):
        y = x
        y = self.conv1(y)
        y = self.norm1(y)
        y = self.relu(y)
        y = self.conv2(y)
        y = self.norm2(y)
        y = self.relu(y)

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x+y)

class MultiBasicEncoder(nn.Module):
    def __init__(self, output_dim=[128], norm_fn='batch', dropout=0.0, downsample=3):
        super(MultiBasicEncoder, self).__init__()
        self.norm_fn = norm_fn
        self.downsample = downsample

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)

        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1 + (downsample > 2), padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(96, stride=1 + (downsample > 1))
        self.layer3 = self._make_layer(128, stride=1 + (downsample > 0))
        self.layer4 = self._make_layer(128, stride=2)
        self.layer5 = self._make_layer(128, stride=2)

        output_list = []

        for dim in output_dim:
            conv_out = nn.Sequential(
                ResidualBlock(128, 128, self.norm_fn, stride=1),
                nn.Conv2d(128, dim[2], 3, padding=1))
            output_list.append(conv_out)

        self.outputs04 = nn.ModuleList(output_list)

        output_list = []
        for dim in output_dim:
            conv_out = nn.Sequential(
                ResidualBlock(128, 128, self.norm_fn, stride=1),
                nn.Conv2d(128, dim[1], 3, padding=1))
            output_list.append(conv_out)

        self.outputs08 = nn.ModuleList(output_list)

        output_list = []
        for dim in output_dim:
            conv_out = nn.Conv2d(128, dim[0], 3, padding=1)
            output_list.append(conv_out)

        self.outputs16 = nn.ModuleList(output_list)

        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)
        else:
            self.dropout = None

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x, dual_inp=False, num_layers=3):

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        if dual_inp:
            v = x
            x = x[:(x.shape[0]//2)]

        outputs04 = [f(x) for f in self.outputs04]
        if num_layers == 1:
            return (outputs04, v) if dual_inp else (outputs04,)

        y = self.layer4(x)
        outputs08 = [f(y) for f in self.outputs08]

        if num_layers == 2:
            return (outputs04, outputs08, v) if dual_inp else (outputs04, outputs08)

        z = self.layer5(y)
        outputs16 = [f(z) for f in self.outputs16]

        return (outputs04, outputs08, outputs16, v) if dual_inp else (outputs04, outputs08, outputs16)

class Feature(SubModule):
    def __init__(self):
        super(Feature, self).__init__()
        pretrained = True
        model = timm.create_model('mobilenetv2_100', pretrained=pretrained, features_only=True)

        layers = [1, 2, 3, 5, 6]
        chans = [16, 24, 32, 96, 160]
        self.conv_stem = model.conv_stem
        self.bn1 = model.bn1

        self.block0 = torch.nn.Sequential(*model.blocks[0:layers[0]])
        self.block1 = torch.nn.Sequential(*model.blocks[layers[0]:layers[1]])
        self.block2 = torch.nn.Sequential(*model.blocks[layers[1]:layers[2]])
        self.block3 = torch.nn.Sequential(*model.blocks[layers[2]:layers[3]])
        self.block4 = torch.nn.Sequential(*model.blocks[layers[3]:layers[4]])

        self.deconv32_16 = Conv2x_IN(chans[4], chans[3], deconv=True, concat=True)
        self.deconv16_8 = Conv2x_IN(chans[3]*2, chans[2], deconv=True, concat=True)
        self.deconv8_4 = Conv2x_IN(chans[2]*2, chans[1], deconv=True, concat=True)
        self.conv4 = BasicConv_IN(chans[1]*2, chans[1]*2, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # x: [B, V, 3, H, W]; views are folded into the batch dimension
        B, V, _, H, W = x.size()
        x = x.view(B * V, -1, H, W)
        x = self.bn1(self.conv_stem(x))
        x2 = self.block0(x)
        x4 = self.block1(x2)
        x8 = self.block2(x4)
        x16 = self.block3(x8)
        x32 = self.block4(x16)

        # top-down fusion back to 1/4 resolution
        x16 = self.deconv32_16(x32, x16)
        x8 = self.deconv16_8(x16, x8)
        x4 = self.deconv8_4(x8, x4)
        x4 = self.conv4(x4)

        x4 = x4.view(B, V, -1, H // 4, W // 4)
        x8 = x8.view(B, V, -1, H // 8, W // 8)
        x16 = x16.view(B, V, -1, H // 16, W // 16)
        x32 = x32.view(B, V, -1, H // 32, W // 32)
        return [x4, x8, x16, x32]
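A quick shape walk-through for the backbone above; the input sizes are assumptions for illustration, and the first run downloads timm's mobilenetv2_100 weights:

    import torch
    from core.extractor import Feature

    net = Feature().eval()
    imgs = torch.randn(2, 3, 3, 256, 320)   # [B, V, 3, H, W], assumed sizes
    with torch.no_grad():
        x4, x8, x16, x32 = net(imgs)
    print(x4.shape)   # torch.Size([2, 3, 48, 64, 80])   -> chans[1]*2 channels at 1/4
    print(x8.shape)   # torch.Size([2, 3, 64, 32, 40])   -> chans[2]*2 channels at 1/8
    print(x16.shape)  # torch.Size([2, 3, 192, 16, 20])  -> chans[3]*2 channels at 1/16
    print(x32.shape)  # torch.Size([2, 3, 160, 8, 10])   -> chans[4] channels at 1/32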
195
IGEV-MVS/core/igev_mvs.py
Normal file
@@ -0,0 +1,195 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .submodule import *
from .corr import *
from .extractor import *
from .update import *

try:
    autocast = torch.cuda.amp.autocast
except:
    # dummy autocast for PyTorch versions without torch.cuda.amp
    class autocast:
        def __init__(self, enabled):
            pass
        def __enter__(self):
            pass
        def __exit__(self, *args):
            pass

class IGEVMVS(nn.Module):
    def __init__(self, args):
        super().__init__()

        context_dims = [128, 128, 128]
        self.n_gru_layers = 3
        self.slow_fast_gru = False
        self.mixed_precision = True
        self.num_sample = 64
        self.G = 1
        self.corr_radius = 4
        self.corr_levels = 2
        self.iters = args.iteration
        self.update_block = BasicMultiUpdateBlock(hidden_dims=context_dims)
        self.conv_hidden_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1, stride=1)
        self.conv_hidden_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1, stride=2)
        self.conv_hidden_4 = nn.Conv2d(128, 128, kernel_size=3, padding=1, stride=2)
        self.feature = Feature()

        self.stem_2 = nn.Sequential(
            BasicConv_IN(3, 32, kernel_size=3, stride=2, padding=1),
            nn.Conv2d(32, 32, 3, 1, 1, bias=False),
            nn.InstanceNorm2d(32), nn.ReLU()
        )
        self.stem_4 = nn.Sequential(
            BasicConv_IN(32, 48, kernel_size=3, stride=2, padding=1),
            nn.Conv2d(48, 48, 3, 1, 1, bias=False),
            nn.InstanceNorm2d(48), nn.ReLU()
        )

        self.conv = BasicConv_IN(96, 48, kernel_size=3, padding=1, stride=1)
        self.desc = nn.Conv2d(48, 48, kernel_size=1, padding=0, stride=1)

        self.spx = nn.Sequential(nn.ConvTranspose2d(2*32, 9, kernel_size=4, stride=2, padding=1),)
        self.spx_2 = Conv2x_IN(32, 32, True)
        self.spx_4 = nn.Sequential(
            BasicConv_IN(96, 32, kernel_size=3, stride=1, padding=1),
            nn.Conv2d(32, 32, 3, 1, 1, bias=False),
            nn.InstanceNorm2d(32), nn.ReLU()
        )

        self.depth_initialization = DepthInitialization(self.num_sample)
        self.pixel_view_weight = PixelViewWeight(self.G)

        self.corr_stem = BasicConv(1, 8, is_3d=True, kernel_size=3, stride=1, padding=1)
        self.corr_feature_att = FeatureAtt(8, 96)
        self.cost_agg = hourglass(8)

        self.spx_2_gru = Conv2x(32, 32, True)
        self.spx_gru = nn.Sequential(nn.ConvTranspose2d(2*32, 9, kernel_size=4, stride=2, padding=1),)

    def upsample_disp(self, depth, mask_feat_4, stem_2x):
        with autocast(enabled=self.mixed_precision):
            xspx = self.spx_2_gru(mask_feat_4, stem_2x)
            spx_pred = self.spx_gru(xspx)
            spx_pred = F.softmax(spx_pred, 1)

            up_depth = context_upsample(depth, spx_pred).unsqueeze(1)

        return up_depth

    def forward(self, imgs, proj_matrices, depth_min, depth_max, test_mode=False):
        proj_matrices_2 = torch.unbind(proj_matrices['level_2'].float(), 1)
        depth_min = depth_min.float()
        depth_max = depth_max.float()

        ref_proj = proj_matrices_2[0]
        src_projs = proj_matrices_2[1:]

        with autocast(enabled=self.mixed_precision):
            images = torch.unbind(imgs['level_0'], dim=1)
            features = self.feature(imgs['level_0'])
            ref_feature = []
            for fea in features:
                ref_feature.append(torch.unbind(fea, dim=1)[0])
            src_features = [src_fea for src_fea in torch.unbind(features[0], dim=1)[1:]]

            stem_2x = self.stem_2(images[0])
            stem_4x = self.stem_4(stem_2x)
            ref_feature[0] = torch.cat((ref_feature[0], stem_4x), 1)

            for idx, src_fea in enumerate(src_features):
                stem_2y = self.stem_2(images[idx + 1])
                stem_4y = self.stem_4(stem_2y)
                src_features[idx] = torch.cat((src_fea, stem_4y), 1)

            match_left = self.desc(self.conv(ref_feature[0]))
            match_left = match_left / torch.norm(match_left, 2, 1, True)

            match_rights = [self.desc(self.conv(src_fea)) for src_fea in src_features]
            match_rights = [match_right / torch.norm(match_right, 2, 1, True) for match_right in match_rights]

            xspx = self.spx_4(ref_feature[0])
            xspx = self.spx_2(xspx, stem_2x)
            spx_pred = self.spx(xspx)
            spx_pred = F.softmax(spx_pred, 1)

        batch, dim, height, width = match_left.size()
        inverse_depth_min = (1.0 / depth_min).view(batch, 1, 1, 1)
        inverse_depth_max = (1.0 / depth_max).view(batch, 1, 1, 1)

        device = match_left.device
        correlation_sum = 0
        view_weight_sum = 1e-5

        # warping and correlation run in fp32, outside autocast
        match_left = match_left.float()
        depth_samples = self.depth_initialization(inverse_depth_min, inverse_depth_max, height, width, device)
        for src_feature, src_proj in zip(match_rights, src_projs):
            src_feature = src_feature.float()
            warped_feature = differentiable_warping(src_feature, src_proj, ref_proj, depth_samples)
            warped_feature = warped_feature.view(batch, self.G, dim // self.G, self.num_sample, height, width)
            correlation = torch.mean(warped_feature * match_left.view(batch, self.G, dim // self.G, 1, height, width), dim=2, keepdim=False)

            view_weight = self.pixel_view_weight(correlation)
            del warped_feature, src_feature, src_proj

            correlation_sum += correlation * view_weight.unsqueeze(1)
            view_weight_sum += view_weight.unsqueeze(1)  # accumulate once per view (the uploaded line also re-added the running sum)
            del correlation, view_weight
        del match_left, match_rights, src_projs

        with autocast(enabled=self.mixed_precision):
            init_corr_volume = correlation_sum.div_(view_weight_sum)
            corr_volume = self.corr_stem(init_corr_volume)
            corr_volume = self.corr_feature_att(corr_volume, ref_feature[0])
            regularized_cost_volume = self.cost_agg(corr_volume, ref_feature)

            GEV_hidden = self.conv_hidden_1(regularized_cost_volume.squeeze(1))
            GEV_hidden_2 = self.conv_hidden_2(GEV_hidden)
            GEV_hidden_4 = self.conv_hidden_4(GEV_hidden_2)
            net_list = [GEV_hidden, GEV_hidden_2, GEV_hidden_4]
            net_list = [torch.tanh(x) for x in net_list]

        corr_block = CorrBlock1D_Cost_Volume

        init_corr_volume = init_corr_volume.float()
        regularized_cost_volume = regularized_cost_volume.float()
        # soft-argmin over the depth dimension gives the initial depth index
        probability = F.softmax(regularized_cost_volume.squeeze(1), dim=1)
        index = torch.arange(0, self.num_sample, 1, device=probability.device).view(1, self.num_sample, 1, 1).float()
        disp_init = torch.sum(index * probability, dim=1, keepdim=True)

        corr_fn = corr_block(init_corr_volume, regularized_cost_volume, radius=self.corr_radius, num_levels=self.corr_levels, inverse_depth_min=inverse_depth_min, inverse_depth_max=inverse_depth_max, num_sample=self.num_sample)

        disp_predictions = []
        disp = disp_init

        for itr in range(self.iters):
            disp = disp.detach()
            corr = corr_fn(disp)

            with autocast(enabled=self.mixed_precision):
                if self.n_gru_layers == 3 and self.slow_fast_gru:  # update low-res GRU
                    net_list = self.update_block(net_list, iter16=True, iter08=False, iter04=False, update=False)
                if self.n_gru_layers >= 2 and self.slow_fast_gru:  # update low-res and mid-res GRUs
                    net_list = self.update_block(net_list, iter16=self.n_gru_layers==3, iter08=True, iter04=False, update=False)
                net_list, mask_feat_4, delta_disp = self.update_block(net_list, corr, disp, iter16=self.n_gru_layers==3, iter08=self.n_gru_layers>=2)

            disp = disp + delta_disp

            if test_mode and itr < self.iters-1:
                continue

            disp_up = self.upsample_disp(disp, mask_feat_4, stem_2x) / (self.num_sample-1)
            disp_predictions.append(disp_up)

        disp_init = context_upsample(disp_init, spx_pred.float()).unsqueeze(1) / (self.num_sample-1)

        if test_mode:
            return disp_up

        return disp_init, disp_predictions
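The loop above regresses a depth index that is divided by (num_sample - 1), so every returned map lies in the normalized inverse-depth range [0, 1]. A hedged sketch of the conversion back to metric depth, using the depth_unnormalization helper from submodule.py; the variable names mirror the forward pass but are assumptions about a typical evaluation script:

    # disp_up: [B, 1, H, W] in [0, 1]; depth_min / depth_max: [B] metric bounds
    inverse_depth_min = (1.0 / depth_min).view(-1, 1, 1, 1)
    inverse_depth_max = (1.0 / depth_max).view(-1, 1, 1, 1)
    depth = depth_unnormalization(disp_up, inverse_depth_min, inverse_depth_max)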
396
IGEV-MVS/core/submodule.py
Normal file
@@ -0,0 +1,396 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class SubModule(nn.Module):
    def __init__(self):
        super(SubModule, self).__init__()

    def weight_init(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.Conv3d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.kernel_size[2] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

class BasicConv(nn.Module):

    def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, bn=True, relu=True, **kwargs):
        super(BasicConv, self).__init__()

        self.relu = relu
        self.use_bn = bn
        if is_3d:
            if deconv:
                self.conv = nn.ConvTranspose3d(in_channels, out_channels, bias=False, **kwargs)
            else:
                self.conv = nn.Conv3d(in_channels, out_channels, bias=False, **kwargs)
            self.bn = nn.BatchNorm3d(out_channels)
        else:
            if deconv:
                self.conv = nn.ConvTranspose2d(in_channels, out_channels, bias=False, **kwargs)
            else:
                self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
            self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        if self.use_bn:
            x = self.bn(x)
        if self.relu:
            x = nn.LeakyReLU()(x)
        return x

class BasicConv_IN(nn.Module):

    def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, IN=True, relu=True, **kwargs):
        super(BasicConv_IN, self).__init__()

        self.relu = relu
        self.use_in = IN
        if is_3d:
            if deconv:
                self.conv = nn.ConvTranspose3d(in_channels, out_channels, bias=False, **kwargs)
            else:
                self.conv = nn.Conv3d(in_channels, out_channels, bias=False, **kwargs)
            self.IN = nn.InstanceNorm3d(out_channels)
        else:
            if deconv:
                self.conv = nn.ConvTranspose2d(in_channels, out_channels, bias=False, **kwargs)
            else:
                self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
            self.IN = nn.InstanceNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        if self.use_in:
            x = self.IN(x)
        if self.relu:
            x = nn.LeakyReLU()(x)
        return x

class Conv2x(nn.Module):

    def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, concat=True, keep_concat=True, bn=True, relu=True, keep_dispc=False):
        super(Conv2x, self).__init__()
        self.concat = concat
        self.is_3d = is_3d
        if deconv and is_3d:
            kernel = (4, 4, 4)
        elif deconv:
            kernel = 4
        else:
            kernel = 3

        if deconv and is_3d and keep_dispc:
            kernel = (1, 4, 4)
            stride = (1, 2, 2)
            padding = (0, 1, 1)
            self.conv1 = BasicConv(in_channels, out_channels, deconv, is_3d, bn=True, relu=True, kernel_size=kernel, stride=stride, padding=padding)
        else:
            self.conv1 = BasicConv(in_channels, out_channels, deconv, is_3d, bn=True, relu=True, kernel_size=kernel, stride=2, padding=1)

        if self.concat:
            mul = 2 if keep_concat else 1
            self.conv2 = BasicConv(out_channels*2, out_channels*mul, False, is_3d, bn, relu, kernel_size=3, stride=1, padding=1)
        else:
            self.conv2 = BasicConv(out_channels, out_channels, False, is_3d, bn, relu, kernel_size=3, stride=1, padding=1)

    def forward(self, x, rem):
        x = self.conv1(x)
        if x.shape != rem.shape:
            x = F.interpolate(
                x,
                size=(rem.shape[-2], rem.shape[-1]),
                mode='nearest')
        if self.concat:
            x = torch.cat((x, rem), 1)
        else:
            x = x + rem
        x = self.conv2(x)
        return x

class Conv2x_IN(nn.Module):

    def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, concat=True, keep_concat=True, IN=True, relu=True, keep_dispc=False):
        super(Conv2x_IN, self).__init__()
        self.concat = concat
        self.is_3d = is_3d
        if deconv and is_3d:
            kernel = (4, 4, 4)
        elif deconv:
            kernel = 4
        else:
            kernel = 3

        if deconv and is_3d and keep_dispc:
            kernel = (1, 4, 4)
            stride = (1, 2, 2)
            padding = (0, 1, 1)
            self.conv1 = BasicConv_IN(in_channels, out_channels, deconv, is_3d, IN=True, relu=True, kernel_size=kernel, stride=stride, padding=padding)
        else:
            self.conv1 = BasicConv_IN(in_channels, out_channels, deconv, is_3d, IN=True, relu=True, kernel_size=kernel, stride=2, padding=1)

        if self.concat:
            mul = 2 if keep_concat else 1
            self.conv2 = BasicConv_IN(out_channels*2, out_channels*mul, False, is_3d, IN, relu, kernel_size=3, stride=1, padding=1)
        else:
            self.conv2 = BasicConv_IN(out_channels, out_channels, False, is_3d, IN, relu, kernel_size=3, stride=1, padding=1)

    def forward(self, x, rem):
        x = self.conv1(x)
        if x.shape != rem.shape:
            x = F.interpolate(
                x,
                size=(rem.shape[-2], rem.shape[-1]),
                mode='nearest')
        if self.concat:
            x = torch.cat((x, rem), 1)
        else:
            x = x + rem
        x = self.conv2(x)
        return x

class ConvReLU(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, pad=1, dilation=1):
        super(ConvReLU, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=pad, dilation=dilation, bias=False)

    def forward(self, x):
        return F.relu(self.conv(x), inplace=True)

class DepthInitialization(nn.Module):
    def __init__(self, num_sample):
        super(DepthInitialization, self).__init__()
        self.num_sample = num_sample

    def forward(self, inverse_depth_min, inverse_depth_max, height, width, device):
        batch = inverse_depth_min.size()[0]
        # uniform samples in inverse-depth space, then converted back to depth
        index = torch.arange(0, self.num_sample, 1, device=device).view(1, self.num_sample, 1, 1).float()
        normalized_sample = index.repeat(batch, 1, height, width) / (self.num_sample-1)
        depth_sample = inverse_depth_max + normalized_sample * (inverse_depth_min - inverse_depth_max)

        depth_sample = 1.0 / depth_sample

        return depth_sample

class PixelViewWeight(nn.Module):
    def __init__(self, G):
        super(PixelViewWeight, self).__init__()
        self.conv = nn.Sequential(
            ConvReLU(G, 16),
            nn.Conv2d(16, 1, 1, stride=1, padding=0),
        )

    def forward(self, x):
        # x: [B, G, N, H, W]
        batch, dim, num_depth, height, width = x.size()
        x = x.permute(0, 2, 1, 3, 4).contiguous()
        x = x.view(batch*num_depth, dim, height, width)  # [B*N, G, H, W]
        x = self.conv(x).view(batch, num_depth, height, width)
        x = torch.softmax(x, dim=1)
        x = torch.max(x, dim=1)[0]

        return x.unsqueeze(1)

class FeatureAtt(nn.Module):
    def __init__(self, cv_chan, feat_chan):
        super(FeatureAtt, self).__init__()

        self.feat_att = nn.Sequential(
            BasicConv(feat_chan, feat_chan//2, kernel_size=1, stride=1, padding=0),
            nn.Conv2d(feat_chan//2, cv_chan, 1))

    def forward(self, cv, feat):
        '''Modulate the 3D cost volume with attention weights from 2D features.'''
        feat_att = self.feat_att(feat).unsqueeze(2)
        cv = torch.sigmoid(feat_att)*cv
        return cv

class hourglass(nn.Module):
    def __init__(self, in_channels):
        super(hourglass, self).__init__()

        self.conv1 = nn.Sequential(BasicConv(in_channels, in_channels*2, is_3d=True, bn=True, relu=True, kernel_size=3,
                                             padding=1, stride=2, dilation=1),
                                   BasicConv(in_channels*2, in_channels*2, is_3d=True, bn=True, relu=True, kernel_size=3,
                                             padding=1, stride=1, dilation=1))

        self.conv2 = nn.Sequential(BasicConv(in_channels*2, in_channels*4, is_3d=True, bn=True, relu=True, kernel_size=3,
                                             padding=1, stride=2, dilation=1),
                                   BasicConv(in_channels*4, in_channels*4, is_3d=True, bn=True, relu=True, kernel_size=3,
                                             padding=1, stride=1, dilation=1))

        self.conv3 = nn.Sequential(BasicConv(in_channels*4, in_channels*6, is_3d=True, bn=True, relu=True, kernel_size=3,
                                             padding=1, stride=2, dilation=1),
                                   BasicConv(in_channels*6, in_channels*6, is_3d=True, bn=True, relu=True, kernel_size=3,
                                             padding=1, stride=1, dilation=1))

        self.conv3_up = BasicConv(in_channels*6, in_channels*4, deconv=True, is_3d=True, bn=True,
                                  relu=True, kernel_size=(4, 4, 4), padding=(1, 1, 1), stride=(2, 2, 2))

        self.conv2_up = BasicConv(in_channels*4, in_channels*2, deconv=True, is_3d=True, bn=True,
                                  relu=True, kernel_size=(4, 4, 4), padding=(1, 1, 1), stride=(2, 2, 2))

        self.conv1_up = BasicConv(in_channels*2, 1, deconv=True, is_3d=True, bn=False,
                                  relu=False, kernel_size=(4, 4, 4), padding=(1, 1, 1), stride=(2, 2, 2))

        self.agg_0 = nn.Sequential(BasicConv(in_channels*8, in_channels*4, is_3d=True, kernel_size=1, padding=0, stride=1),
                                   BasicConv(in_channels*4, in_channels*4, is_3d=True, kernel_size=3, padding=1, stride=1),
                                   BasicConv(in_channels*4, in_channels*4, is_3d=True, kernel_size=3, padding=1, stride=1),)

        self.agg_1 = nn.Sequential(BasicConv(in_channels*4, in_channels*2, is_3d=True, kernel_size=1, padding=0, stride=1),
                                   BasicConv(in_channels*2, in_channels*2, is_3d=True, kernel_size=3, padding=1, stride=1),
                                   BasicConv(in_channels*2, in_channels*2, is_3d=True, kernel_size=3, padding=1, stride=1))

        self.feature_att_8 = FeatureAtt(in_channels*2, 64)
        self.feature_att_16 = FeatureAtt(in_channels*4, 192)
        self.feature_att_32 = FeatureAtt(in_channels*6, 160)
        self.feature_att_up_16 = FeatureAtt(in_channels*4, 192)
        self.feature_att_up_8 = FeatureAtt(in_channels*2, 64)

    def forward(self, x, features):
        conv1 = self.conv1(x)
        conv1 = self.feature_att_8(conv1, features[1])

        conv2 = self.conv2(conv1)
        conv2 = self.feature_att_16(conv2, features[2])

        conv3 = self.conv3(conv2)
        conv3 = self.feature_att_32(conv3, features[3])

        conv3_up = self.conv3_up(conv3)

        conv2 = torch.cat((conv3_up, conv2), dim=1)
        conv2 = self.agg_0(conv2)
        conv2 = self.feature_att_up_16(conv2, features[2])

        conv2_up = self.conv2_up(conv2)

        conv1 = torch.cat((conv2_up, conv1), dim=1)
        conv1 = self.agg_1(conv1)
        conv1 = self.feature_att_up_8(conv1, features[1])

        conv = self.conv1_up(conv1)

        return conv

def bilinear_sampler(img, coords, mode='bilinear', mask=False):
    """ Wrapper for grid_sample, uses pixel coordinates """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    xgrid = 2*xgrid/(W-1) - 1

    # the depth lookup is 1-D: every row index must be identical
    assert torch.unique(ygrid).numel() == 1 and H == 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True)
    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img

def context_upsample(disp_low, up_weights):
    # disp_low:   (b, 1, h, w)
    # up_weights: (b, 9, 4*h, 4*w)
    b, c, h, w = disp_low.shape

    disp_unfold = F.unfold(disp_low.reshape(b, c, h, w), 3, 1, 1).reshape(b, -1, h, w)
    disp_unfold = F.interpolate(disp_unfold, (h*4, w*4), mode='nearest').reshape(b, 9, h*4, w*4)

    disp = (disp_unfold*up_weights).sum(1)

    return disp

def pool2x(x):
    return F.avg_pool2d(x, 3, stride=2, padding=1)

def interp(x, dest):
    interp_args = {'mode': 'bilinear', 'align_corners': True}
    return F.interpolate(x, dest.shape[2:], **interp_args)

def differentiable_warping(src_fea, src_proj, ref_proj, depth_samples, return_mask=False):
    # src_fea: [B, C, H, W]
    # src_proj: [B, 4, 4]
    # ref_proj: [B, 4, 4]
    # depth_samples: [B, Ndepth, H, W]
    # out: [B, C, Ndepth, H, W]
    batch, num_depth, height, width = depth_samples.size()
    height1, width1 = src_fea.size()[2:]

    with torch.no_grad():
        if batch == 2:
            inv_ref_proj = []
            for i in range(batch):
                inv_ref_proj.append(torch.inverse(ref_proj[i]).unsqueeze(0))
            inv_ref_proj = torch.cat(inv_ref_proj, dim=0)
            assert (not torch.isnan(inv_ref_proj).any()), "nan in inverse(ref_proj)"
            proj = torch.matmul(src_proj, inv_ref_proj)
        else:
            proj = torch.matmul(src_proj, torch.inverse(ref_proj))
        assert (not torch.isnan(proj).any()), "nan in proj"

        rot = proj[:, :3, :3]  # [B,3,3]
        trans = proj[:, :3, 3:4]  # [B,3,1]
        y, x = torch.meshgrid([torch.arange(0, height, dtype=torch.float32, device=depth_samples.device),
                               torch.arange(0, width, dtype=torch.float32, device=depth_samples.device)])
        y, x = y.contiguous(), x.contiguous()
        y, x = y.view(height * width), x.view(height * width)
        y = y*(height1/height)
        x = x*(width1/width)
        xyz = torch.stack((x, y, torch.ones_like(x)))  # [3, H*W]
        xyz = torch.unsqueeze(xyz, 0).repeat(batch, 1, 1)  # [B, 3, H*W]
        rot_xyz = torch.matmul(rot, xyz)  # [B, 3, H*W]

        rot_depth_xyz = rot_xyz.unsqueeze(2).repeat(1, 1, num_depth, 1) * depth_samples.view(batch, 1, num_depth,
                                                                                             height * width)  # [B, 3, Ndepth, H*W]
        proj_xyz = rot_depth_xyz + trans.view(batch, 3, 1, 1)  # [B, 3, Ndepth, H*W]
        # avoid negative depth
        valid_mask = proj_xyz[:, 2:] > 1e-2
        proj_xyz[:, 0:1][~valid_mask] = width
        proj_xyz[:, 1:2][~valid_mask] = height
        proj_xyz[:, 2:3][~valid_mask] = 1
        proj_xy = proj_xyz[:, :2, :, :] / proj_xyz[:, 2:3, :, :]  # [B, 2, Ndepth, H*W]
        valid_mask = valid_mask & (proj_xy[:, 0:1] >= 0) & (proj_xy[:, 0:1] < width) \
                     & (proj_xy[:, 1:2] >= 0) & (proj_xy[:, 1:2] < height)
        proj_x_normalized = proj_xy[:, 0, :, :] / ((width1 - 1) / 2) - 1  # [B, Ndepth, H*W]
        proj_y_normalized = proj_xy[:, 1, :, :] / ((height1 - 1) / 2) - 1
        proj_xy = torch.stack((proj_x_normalized, proj_y_normalized), dim=3)  # [B, Ndepth, H*W, 2]
        grid = proj_xy

    dim = src_fea.size()[1]
    warped_src_fea = F.grid_sample(src_fea, grid.view(batch, num_depth * height, width, 2), mode='bilinear',
                                   padding_mode='zeros', align_corners=True)
    warped_src_fea = warped_src_fea.view(batch, dim, num_depth, height, width)
    if return_mask:
        valid_mask = valid_mask.view(batch, num_depth, height, width)
        return warped_src_fea, valid_mask
    else:
        return warped_src_fea

def depth_normalization(depth, inverse_depth_min, inverse_depth_max):
    '''convert depth map to the index in inverse range'''
    inverse_depth = 1.0 / (depth+1e-5)
    normalized_depth = (inverse_depth - inverse_depth_max) / (inverse_depth_min - inverse_depth_max)
    return normalized_depth

def depth_unnormalization(normalized_depth, inverse_depth_min, inverse_depth_max):
    '''convert the index in inverse range to depth map'''
    inverse_depth = inverse_depth_max + normalized_depth * (inverse_depth_min - inverse_depth_max)  # [B,1,H,W]
    depth = 1.0 / inverse_depth
    return depth
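The two depth helpers at the end are inverses of one another up to the 1e-5 guard; a minimal round-trip check with an assumed depth range:

    import torch
    from core.submodule import depth_normalization, depth_unnormalization

    depth_min, depth_max = 2.0, 10.0              # assumed metric range
    inv_min, inv_max = 1.0 / depth_min, 1.0 / depth_max
    depth = torch.tensor([[[[2.5, 5.0, 9.0]]]])   # [B, 1, H, W]

    idx = depth_normalization(depth, inv_min, inv_max)    # indices in [0, 1]
    back = depth_unnormalization(idx, inv_min, inv_max)   # metric depth again
    print(torch.allclose(back, depth, atol=1e-3))         # True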
94
IGEV-MVS/core/update.py
Normal file
@@ -0,0 +1,94 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .submodule import *

class BasicMotionEncoder(nn.Module):
    def __init__(self):
        super(BasicMotionEncoder, self).__init__()
        self.corr_levels = 2
        self.corr_radius = 4

        cor_planes = 2 * self.corr_levels * (2*self.corr_radius + 1)

        self.convc1 = nn.Conv2d(cor_planes, 64, 1, padding=0)
        self.convc2 = nn.Conv2d(64, 64, 3, padding=1)
        self.convd1 = nn.Conv2d(1, 64, 7, padding=3)
        self.convd2 = nn.Conv2d(64, 64, 3, padding=1)
        self.conv = nn.Conv2d(64+64, 128-1, 3, padding=1)

    def forward(self, disp, corr):
        cor = F.relu(self.convc1(corr))
        cor = F.relu(self.convc2(cor))
        disp_ = F.relu(self.convd1(disp))
        disp_ = F.relu(self.convd2(disp_))

        cor_disp = torch.cat([cor, disp_], dim=1)
        out = F.relu(self.conv(cor_disp))
        return torch.cat([out, disp], dim=1)

class ConvGRU(nn.Module):
    def __init__(self, hidden_dim, input_dim, kernel_size=3):
        super(ConvGRU, self).__init__()
        self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2)
        self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2)
        self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2)

    def forward(self, h, *x_list):
        x = torch.cat(x_list, dim=1)
        hx = torch.cat([h, x], dim=1)
        z = torch.sigmoid(self.convz(hx))
        r = torch.sigmoid(self.convr(hx))
        q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)))
        h = (1-z) * h + z * q
        return h

class DispHead(nn.Module):
    def __init__(self, input_dim=128, hidden_dim=256, output_dim=1):
        super(DispHead, self).__init__()
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
        self.conv2 = nn.Conv2d(hidden_dim, output_dim, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.conv2(self.relu(self.conv1(x)))

class BasicMultiUpdateBlock(nn.Module):
    def __init__(self, hidden_dims=[]):
        super().__init__()
        self.n_gru_layers = 3
        self.n_downsample = 2
        self.encoder = BasicMotionEncoder()
        encoder_output_dim = 128

        self.gru04 = ConvGRU(hidden_dims[2], encoder_output_dim + hidden_dims[1] * (self.n_gru_layers > 1))
        self.gru08 = ConvGRU(hidden_dims[1], hidden_dims[0] * (self.n_gru_layers == 3) + hidden_dims[2])
        self.gru16 = ConvGRU(hidden_dims[0], hidden_dims[1])
        self.disp_head = DispHead(hidden_dims[2], hidden_dim=256, output_dim=1)
        factor = 2**self.n_downsample

        self.mask_feat_4 = nn.Sequential(
            nn.Conv2d(hidden_dims[2], 32, 3, padding=1),
            nn.ReLU(inplace=True))

    def forward(self, net, corr=None, disp=None, iter04=True, iter08=True, iter16=True, update=True):
        if iter16:
            net[2] = self.gru16(net[2], pool2x(net[1]))
        if iter08:
            if self.n_gru_layers > 2:
                net[1] = self.gru08(net[1], pool2x(net[0]), interp(net[2], net[1]))
            else:
                net[1] = self.gru08(net[1], pool2x(net[0]))
        if iter04:
            motion_features = self.encoder(disp, corr)
            if self.n_gru_layers > 1:
                net[0] = self.gru04(net[0], motion_features, interp(net[1], net[0]))
            else:
                net[0] = self.gru04(net[0], motion_features)

        if not update:
            return net

        delta_disp = self.disp_head(net[0])
        mask_feat_4 = self.mask_feat_4(net[0])
        return net, mask_feat_4, delta_disp
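The ConvGRU accepts any number of input maps and concatenates them along channels, so the same cell serves every pyramid level; a single-step sketch with assumed sizes:

    import torch
    from core.update import ConvGRU

    gru = ConvGRU(hidden_dim=128, input_dim=256)  # input_dim = sum of the inputs' channels
    h = torch.zeros(1, 128, 32, 40)               # hidden state
    motion = torch.randn(1, 128, 32, 40)          # e.g. motion features
    context = torch.randn(1, 128, 32, 40)         # e.g. interpolated coarser hidden state
    h = gru(h, motion, context)
    print(h.shape)  # torch.Size([1, 128, 32, 40])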
8
IGEV-MVS/datasets/__init__.py
Normal file
@@ -0,0 +1,8 @@
import importlib


# find the dataset definition by name, for example dtu_yao (dtu_yao.py)
def find_dataset_def(dataset_name):
    module_name = 'datasets.{}'.format(dataset_name)
    module = importlib.import_module(module_name)
    return getattr(module, "MVSDataset")
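Typical usage, assuming the script runs from the IGEV-MVS directory; the paths are placeholders, and dtu_yao.py below defines the matching MVSDataset:

    from datasets import find_dataset_def

    MVSDataset = find_dataset_def('dtu_yao')
    dataset = MVSDataset(datapath='/path/to/dtu', listfile='lists/train.txt',
                         mode='train', nviews=5)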
208
IGEV-MVS/datasets/blendedmvs.py
Normal file
@@ -0,0 +1,208 @@
from torch.utils.data import Dataset
from datasets.data_io import *
import os
import numpy as np
import cv2
from PIL import Image
from torchvision import transforms as T
import random

class MVSDataset(Dataset):
    def __init__(self, datapath, listfile, split, nviews, img_wh=(768, 576), robust_train=True):

        super(MVSDataset, self).__init__()
        self.levels = 4
        self.datapath = datapath
        self.split = split
        self.listfile = listfile
        self.robust_train = robust_train
        assert self.split in ['train', 'val', 'all'], \
            'split must be either "train", "val" or "all"!'

        self.img_wh = img_wh
        if img_wh is not None:
            assert img_wh[0] % 32 == 0 and img_wh[1] % 32 == 0, \
                'img_wh must both be multiples of 32!'
        self.nviews = nviews
        self.scale_factors = {}  # depth scale factors for each scan
        self.build_metas()

        self.color_augment = T.ColorJitter(brightness=0.5, contrast=0.5)

    def build_metas(self):
        self.metas = []
        with open(self.listfile) as f:
            self.scans = [line.rstrip() for line in f.readlines()]
        for scan in self.scans:
            with open(os.path.join(self.datapath, scan, "cams/pair.txt")) as f:
                num_viewpoint = int(f.readline())
                for _ in range(num_viewpoint):
                    ref_view = int(f.readline().rstrip())
                    src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
                    if len(src_views) >= self.nviews-1:
                        self.metas += [(scan, ref_view, src_views)]

    def read_cam_file(self, scan, filename):
        with open(filename) as f:
            lines = f.readlines()
            lines = [line.rstrip() for line in lines]
        # extrinsics: lines [1, 5), 4x4 matrix
        extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4))
        # intrinsics: lines [7, 10), 3x3 matrix
        intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3))
        depth_min = float(lines[11].split()[0])
        depth_max = float(lines[11].split()[-1])
        if scan not in self.scale_factors:
            self.scale_factors[scan] = 100.0/depth_min
        depth_min *= self.scale_factors[scan]
        depth_max *= self.scale_factors[scan]
        extrinsics[:3, 3] *= self.scale_factors[scan]
        return intrinsics, extrinsics, depth_min, depth_max

    def read_depth_mask(self, scan, filename, depth_min, depth_max, scale):
        depth = np.array(read_pfm(filename)[0], dtype=np.float32)
        depth = depth * self.scale_factors[scan] * scale
        depth = np.squeeze(depth, 2)

        mask = (depth >= depth_min) & (depth <= depth_max)
        mask = mask.astype(np.float32)
        if self.img_wh is not None:
            depth = cv2.resize(depth, self.img_wh,
                               interpolation=cv2.INTER_NEAREST)
        h, w = depth.shape
        depth_ms = {}
        mask_ms = {}

        for i in range(4):
            depth_cur = cv2.resize(depth, (w//(2**i), h//(2**i)), interpolation=cv2.INTER_NEAREST)
            mask_cur = cv2.resize(mask, (w//(2**i), h//(2**i)), interpolation=cv2.INTER_NEAREST)

            depth_ms[f"level_{i}"] = depth_cur
            mask_ms[f"level_{i}"] = mask_cur

        return depth_ms, mask_ms

    def read_img(self, filename):
        img = Image.open(filename)
        if self.split == 'train':
            img = self.color_augment(img)
        # scale 0~255 to -1~1
        np_img = 2*np.array(img, dtype=np.float32) / 255. - 1
        if self.img_wh is not None:
            np_img = cv2.resize(np_img, self.img_wh,
                                interpolation=cv2.INTER_LINEAR)
        h, w, _ = np_img.shape
        np_img_ms = {
            "level_3": cv2.resize(np_img, (w//8, h//8), interpolation=cv2.INTER_LINEAR),
            "level_2": cv2.resize(np_img, (w//4, h//4), interpolation=cv2.INTER_LINEAR),
            "level_1": cv2.resize(np_img, (w//2, h//2), interpolation=cv2.INTER_LINEAR),
            "level_0": np_img
        }
        return np_img_ms

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        meta = self.metas[idx]
        scan, ref_view, src_views = meta

        if self.robust_train:
            num_src_views = len(src_views)
            index = random.sample(range(num_src_views), self.nviews - 1)
            view_ids = [ref_view] + [src_views[i] for i in index]
            scale = random.uniform(0.8, 1.25)

        else:
            view_ids = [ref_view] + src_views[:self.nviews - 1]
            scale = 1

        imgs_0 = []
        imgs_1 = []
        imgs_2 = []
        imgs_3 = []
        mask = None
        depth = None
        depth_min = None
        depth_max = None
        proj_matrices_0 = []
        proj_matrices_1 = []
        proj_matrices_2 = []
        proj_matrices_3 = []

        for i, vid in enumerate(view_ids):
            img_filename = os.path.join(self.datapath, '{}/blended_images/{:0>8}.jpg'.format(scan, vid))
            depth_filename = os.path.join(self.datapath, '{}/rendered_depth_maps/{:0>8}.pfm'.format(scan, vid))
            proj_mat_filename = os.path.join(self.datapath, '{}/cams/{:0>8}_cam.txt'.format(scan, vid))

            imgs = self.read_img(img_filename)
            imgs_0.append(imgs['level_0'])
            imgs_1.append(imgs['level_1'])
            imgs_2.append(imgs['level_2'])
            imgs_3.append(imgs['level_3'])

            # scale the intrinsics from the camera file down to each pyramid level,
            # starting at 1/8 (level_3) and doubling back up to level_0
            intrinsics, extrinsics, depth_min_, depth_max_ = self.read_cam_file(scan, proj_mat_filename)
            extrinsics[:3, 3] *= scale
            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 0.125
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_3.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_2.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_1.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_0.append(proj_mat)

            if i == 0:  # reference view
                depth_min = depth_min_ * scale
                depth_max = depth_max_ * scale
                depth, mask = self.read_depth_mask(scan, depth_filename, depth_min, depth_max, scale)
                for l in range(self.levels):
                    mask[f'level_{l}'] = np.expand_dims(mask[f'level_{l}'], 2)
                    mask[f'level_{l}'] = mask[f'level_{l}'].transpose([2, 0, 1])
                    depth[f'level_{l}'] = np.expand_dims(depth[f'level_{l}'], 2)
                    depth[f'level_{l}'] = depth[f'level_{l}'].transpose([2, 0, 1])

        # imgs: N*3*H0*W0, N is the number of views
        imgs_0 = np.stack(imgs_0).transpose([0, 3, 1, 2])
        imgs_1 = np.stack(imgs_1).transpose([0, 3, 1, 2])
        imgs_2 = np.stack(imgs_2).transpose([0, 3, 1, 2])
        imgs_3 = np.stack(imgs_3).transpose([0, 3, 1, 2])
        imgs = {}
        imgs['level_0'] = imgs_0
        imgs['level_1'] = imgs_1
        imgs['level_2'] = imgs_2
        imgs['level_3'] = imgs_3

        # proj_matrices: N*4*4
        proj_matrices_0 = np.stack(proj_matrices_0)
        proj_matrices_1 = np.stack(proj_matrices_1)
        proj_matrices_2 = np.stack(proj_matrices_2)
        proj_matrices_3 = np.stack(proj_matrices_3)
        proj = {}
        proj['level_3'] = proj_matrices_3
        proj['level_2'] = proj_matrices_2
        proj['level_1'] = proj_matrices_1
        proj['level_0'] = proj_matrices_0

        # all entries are numpy arrays
        return {"imgs": imgs,               # [N, 3, H, W]
                "proj_matrices": proj,      # [N, 4, 4]
                "depth": depth,             # [1, H, W]
                "depth_min": depth_min,     # scalar
                "depth_max": depth_max,     # scalar
                "mask": mask}               # [1, H, W]
145
IGEV-MVS/datasets/custom.py
Normal file
@@ -0,0 +1,145 @@
from torch.utils.data import Dataset
from datasets.data_io import *
import os
import numpy as np
import cv2
from PIL import Image
from torchvision import transforms as T
import math

class MVSDataset(Dataset):
    def __init__(self, datapath, n_views=5, img_wh=(640, 480)):
        self.levels = 4
        self.datapath = datapath
        self.img_wh = img_wh
        self.build_metas()
        self.n_views = n_views

    def build_metas(self):
        self.metas = []
        with open(os.path.join(self.datapath, 'pair.txt')) as f:
            num_viewpoint = int(f.readline())
            for view_idx in range(num_viewpoint):
                ref_view = int(f.readline().rstrip())
                src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
                if len(src_views) != 0:
                    self.metas += [(ref_view, src_views)]

    def read_cam_file(self, filename):
        with open(filename) as f:
            lines = [line.rstrip() for line in f.readlines()]
        # extrinsics: lines [1, 5), 4x4 matrix
        extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ')
        extrinsics = extrinsics.reshape((4, 4))
        # intrinsics: lines [7, 10), 3x3 matrix
        intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ')
        intrinsics = intrinsics.reshape((3, 3))

        depth_min = float(lines[11].split()[0])
        depth_max = float(lines[11].split()[-1])

        return intrinsics, extrinsics, depth_min, depth_max

    def read_img(self, filename, h, w):
        img = Image.open(filename)
        # scale 0~255 to -1~1
        np_img = 2*np.array(img, dtype=np.float32) / 255. - 1
        original_h, original_w, _ = np_img.shape
        np_img = cv2.resize(np_img, self.img_wh, interpolation=cv2.INTER_LINEAR)

        np_img_ms = {
            "level_3": cv2.resize(np_img, (w//8, h//8), interpolation=cv2.INTER_LINEAR),
            "level_2": cv2.resize(np_img, (w//4, h//4), interpolation=cv2.INTER_LINEAR),
            "level_1": cv2.resize(np_img, (w//2, h//2), interpolation=cv2.INTER_LINEAR),
            "level_0": np_img
        }
        return np_img_ms, original_h, original_w

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        ref_view, src_views = self.metas[idx]
        # use only the reference view and the first nviews-1 source views
        view_ids = [ref_view] + src_views[:self.n_views-1]

        imgs_0 = []
        imgs_1 = []
        imgs_2 = []
        imgs_3 = []

        depth_min = None
        depth_max = None

        proj_matrices_0 = []
        proj_matrices_1 = []
        proj_matrices_2 = []
        proj_matrices_3 = []

        for i, vid in enumerate(view_ids):
            img_filename = os.path.join(self.datapath, f'images/{vid:08d}.jpg')
            proj_mat_filename = os.path.join(self.datapath, f'cams_1/{vid:08d}_cam.txt')

            imgs, original_h, original_w = self.read_img(img_filename, self.img_wh[1], self.img_wh[0])
            imgs_0.append(imgs['level_0'])
            imgs_1.append(imgs['level_1'])
            imgs_2.append(imgs['level_2'])
            imgs_3.append(imgs['level_3'])

            intrinsics, extrinsics, depth_min_, depth_max_ = self.read_cam_file(proj_mat_filename)
            intrinsics[0] *= self.img_wh[0]/original_w
            intrinsics[1] *= self.img_wh[1]/original_h

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 0.125
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_3.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_2.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_1.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_0.append(proj_mat)

            if i == 0:  # reference view
                depth_min = depth_min_
                depth_max = depth_max_

        # imgs: N*3*H0*W0, N is the number of views
        imgs_0 = np.stack(imgs_0).transpose([0, 3, 1, 2])
        imgs_1 = np.stack(imgs_1).transpose([0, 3, 1, 2])
        imgs_2 = np.stack(imgs_2).transpose([0, 3, 1, 2])
        imgs_3 = np.stack(imgs_3).transpose([0, 3, 1, 2])
        imgs = {}
        imgs['level_0'] = imgs_0
        imgs['level_1'] = imgs_1
        imgs['level_2'] = imgs_2
        imgs['level_3'] = imgs_3
        # proj_matrices: N*4*4
        proj_matrices_0 = np.stack(proj_matrices_0)
        proj_matrices_1 = np.stack(proj_matrices_1)
        proj_matrices_2 = np.stack(proj_matrices_2)
        proj_matrices_3 = np.stack(proj_matrices_3)
        proj = {}
        proj['level_3'] = proj_matrices_3
        proj['level_2'] = proj_matrices_2
        proj['level_1'] = proj_matrices_1
        proj['level_0'] = proj_matrices_0

        return {"imgs": imgs,            # N*3*H0*W0
                "proj_matrices": proj,   # N*4*4
                "depth_min": depth_min,  # scalar
                "depth_max": depth_max,
                "filename": '{}/' + '{:0>8}'.format(view_ids[0]) + "{}"
                }
73
IGEV-MVS/datasets/data_io.py
Normal file
@@ -0,0 +1,73 @@
import numpy as np
import re
import sys


def read_pfm(filename):
    # rb: binary file, read only
    file = open(filename, 'rb')
    color = None
    width = None
    height = None
    scale = None
    endian = None

    header = file.readline().decode('utf-8').rstrip()
    if header == 'PF':
        color = True
    elif header == 'Pf':  # depth maps are greyscale 'Pf'
        color = False
    else:
        raise Exception('Not a PFM file.')

    dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('utf-8'))
    if dim_match:
        width, height = map(int, dim_match.groups())
    else:
        raise Exception('Malformed PFM header.')

    scale = float(file.readline().rstrip())
    if scale < 0:  # little-endian
        endian = '<'
        scale = -scale
    else:
        endian = '>'  # big-endian

    data = np.fromfile(file, endian + 'f')
    shape = (height, width, 3) if color else (height, width, 1)
    # depth: H*W
    data = np.reshape(data, shape)
    data = np.flipud(data)
    file.close()
    return data, scale


def save_pfm(filename, image, scale=1):
    file = open(filename, "wb")
    color = None

    image = np.flipud(image)

    if image.dtype.name != 'float32':
        raise Exception('Image dtype must be float32.')

    if len(image.shape) == 3 and image.shape[2] == 3:  # color image
        color = True
    elif len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1:  # greyscale
        color = False
    else:
        raise Exception('Image must have H x W x 3, H x W x 1 or H x W dimensions.')

    file.write('PF\n'.encode('utf-8') if color else 'Pf\n'.encode('utf-8'))
    file.write('{} {}\n'.format(image.shape[1], image.shape[0]).encode('utf-8'))

    endian = image.dtype.byteorder

    if endian == '<' or endian == '=' and sys.byteorder == 'little':
        scale = -scale

    file.write(('%f\n' % scale).encode('utf-8'))

    image.tofile(file)
    file.close()
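A round-trip sanity check for the PFM helpers; the path is only for illustration:

    import numpy as np
    from datasets.data_io import read_pfm, save_pfm

    depth = np.random.rand(512, 640).astype(np.float32)   # H x W depth map
    save_pfm('/tmp/depth.pfm', depth)
    loaded, scale = read_pfm('/tmp/depth.pfm')
    print(loaded.shape, scale)                  # (512, 640, 1) 1.0
    print(np.allclose(loaded[..., 0], depth))   # True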
236
IGEV-MVS/datasets/dtu_yao.py
Normal file
@ -0,0 +1,236 @@
|
||||
from torch.utils.data import Dataset
import numpy as np
import os
from PIL import Image
from datasets.data_io import *
import cv2
import random
from torchvision import transforms


class MVSDataset(Dataset):
    def __init__(self, datapath, listfile, mode, nviews, robust_train=False):
        super(MVSDataset, self).__init__()

        self.levels = 4
        self.datapath = datapath
        self.listfile = listfile
        self.mode = mode
        self.nviews = nviews
        self.img_wh = (640, 512)
        # self.img_wh = (1440, 1056)
        self.robust_train = robust_train

        assert self.mode in ["train", "val", "test"]
        self.metas = self.build_list()
        self.color_augment = transforms.ColorJitter(brightness=0.5, contrast=0.5)

    def build_list(self):
        metas = []
        with open(self.listfile) as f:
            scans = f.readlines()
            scans = [line.rstrip() for line in scans]

        for scan in scans:
            pair_file = "Cameras_1/pair.txt"

            with open(os.path.join(self.datapath, pair_file)) as f:
                self.num_viewpoint = int(f.readline())
                # viewpoints (49)
                for view_idx in range(self.num_viewpoint):
                    ref_view = int(f.readline().rstrip())
                    src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
                    # light conditions 0-6
                    for light_idx in range(7):
                        metas.append((scan, light_idx, ref_view, src_views))
        print("dataset", self.mode, "metas:", len(metas))
        return metas

    def __len__(self):
        return len(self.metas)

    def read_cam_file(self, filename):
        with open(filename) as f:
            lines = f.readlines()
            lines = [line.rstrip() for line in lines]
        # extrinsics: line [1,5), 4x4 matrix
        extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4))
        # intrinsics: line [7,10), 3x3 matrix
        intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3))
        depth_min = float(lines[11].split()[0])
        depth_max = float(lines[11].split()[-1])
        return intrinsics, extrinsics, depth_min, depth_max

    def read_img(self, filename):
        img = Image.open(filename)
        if self.mode == 'train':
            img = self.color_augment(img)
        # scale 0~255 to -1~1
        np_img = 2 * np.array(img, dtype=np.float32) / 255. - 1
        h, w, _ = np_img.shape
        np_img_ms = {
            "level_3": cv2.resize(np_img, (w//8, h//8), interpolation=cv2.INTER_LINEAR),
            "level_2": cv2.resize(np_img, (w//4, h//4), interpolation=cv2.INTER_LINEAR),
            "level_1": cv2.resize(np_img, (w//2, h//2), interpolation=cv2.INTER_LINEAR),
            "level_0": np_img
        }
        return np_img_ms

    def prepare_img(self, hr_img):
        # downsample
        h, w = hr_img.shape
        # original w,h: 1600, 1200; downsample -> 800, 600; crop -> 640, 512
        hr_img = cv2.resize(hr_img, (w//2, h//2), interpolation=cv2.INTER_NEAREST)
        # crop
        h, w = hr_img.shape
        target_h, target_w = self.img_wh[1], self.img_wh[0]
        start_h, start_w = (h - target_h)//2, (w - target_w)//2
        hr_img_crop = hr_img[start_h: start_h + target_h, start_w: start_w + target_w]

        return hr_img_crop

    def read_mask(self, filename):
        img = Image.open(filename)
        np_img = np.array(img, dtype=np.float32)
        np_img = (np_img > 10).astype(np.float32)
        return np_img

    def read_depth_mask(self, filename, mask_filename, scale):
        depth_hr = np.array(read_pfm(filename)[0], dtype=np.float32) * scale
        depth_hr = np.squeeze(depth_hr, 2)
        depth_lr = self.prepare_img(depth_hr)
        mask = self.read_mask(mask_filename)
        mask = self.prepare_img(mask)
        mask = mask.astype(np.bool_)
        mask = mask.astype(np.float32)

        h, w = depth_lr.shape
        depth_lr_ms = {}
        mask_ms = {}

        for i in range(self.levels):
            depth_cur = cv2.resize(depth_lr, (w//(2**i), h//(2**i)), interpolation=cv2.INTER_NEAREST)
            mask_cur = cv2.resize(mask, (w//(2**i), h//(2**i)), interpolation=cv2.INTER_NEAREST)
            depth_lr_ms[f"level_{i}"] = depth_cur
            mask_ms[f"level_{i}"] = mask_cur

        return depth_lr_ms, mask_ms

    def __getitem__(self, idx):
        meta = self.metas[idx]
        scan, light_idx, ref_view, src_views = meta
        # robust training strategy
        if self.robust_train:
            num_src_views = len(src_views)
            index = random.sample(range(num_src_views), self.nviews - 1)
            view_ids = [ref_view] + [src_views[i] for i in index]
            scale = random.uniform(0.8, 1.25)
        else:
            view_ids = [ref_view] + src_views[:self.nviews - 1]
            scale = 1

        imgs_0 = []
        imgs_1 = []
        imgs_2 = []
        imgs_3 = []

        mask = None
        depth = None
        depth_min = None
        depth_max = None

        proj_matrices_0 = []
        proj_matrices_1 = []
        proj_matrices_2 = []
        proj_matrices_3 = []

        for i, vid in enumerate(view_ids):
            img_filename = os.path.join(self.datapath,
                                        'Rectified/{}_train/rect_{:0>3}_{}_r5000.png'.format(scan, vid + 1, light_idx))
            proj_mat_filename = os.path.join(self.datapath, 'Cameras_1/{}_train/{:0>8}_cam.txt').format(scan, vid)

            mask_filename = os.path.join(self.datapath, 'Depths_raw/{}/depth_visual_{:0>4}.png'.format(scan, vid))
            depth_filename = os.path.join(self.datapath, 'Depths_raw/{}/depth_map_{:0>4}.pfm'.format(scan, vid))

            imgs = self.read_img(img_filename)
            imgs_0.append(imgs['level_0'])
            imgs_1.append(imgs['level_1'])
            imgs_2.append(imgs['level_2'])
            imgs_3.append(imgs['level_3'])

            intrinsics, extrinsics, depth_min_, depth_max_ = self.read_cam_file(proj_mat_filename)
            extrinsics[:3, 3] *= scale
            intrinsics[0] *= 4
            intrinsics[1] *= 4

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 0.125
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_3.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_2.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_1.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_0.append(proj_mat)

            if i == 0:  # reference view
                depth_min = depth_min_ * scale
                depth_max = depth_max_ * scale
                depth, mask = self.read_depth_mask(depth_filename, mask_filename, scale)

                for l in range(self.levels):
                    mask[f'level_{l}'] = np.expand_dims(mask[f'level_{l}'], 2)
                    mask[f'level_{l}'] = mask[f'level_{l}'].transpose([2, 0, 1])
                    depth[f'level_{l}'] = np.expand_dims(depth[f'level_{l}'], 2)
                    depth[f'level_{l}'] = depth[f'level_{l}'].transpose([2, 0, 1])

        # imgs: N*3*H0*W0, N is number of images
        imgs_0 = np.stack(imgs_0).transpose([0, 3, 1, 2])
        imgs_1 = np.stack(imgs_1).transpose([0, 3, 1, 2])
        imgs_2 = np.stack(imgs_2).transpose([0, 3, 1, 2])
        imgs_3 = np.stack(imgs_3).transpose([0, 3, 1, 2])

        imgs = {}
        imgs['level_0'] = imgs_0
        imgs['level_1'] = imgs_1
        imgs['level_2'] = imgs_2
        imgs['level_3'] = imgs_3

        # proj_matrices: N*4*4
        proj_matrices_0 = np.stack(proj_matrices_0)
        proj_matrices_1 = np.stack(proj_matrices_1)
        proj_matrices_2 = np.stack(proj_matrices_2)
        proj_matrices_3 = np.stack(proj_matrices_3)

        proj = {}
        proj['level_3'] = proj_matrices_3
        proj['level_2'] = proj_matrices_2
        proj['level_1'] = proj_matrices_1
        proj['level_0'] = proj_matrices_0

        # data is numpy array
        return {"imgs": imgs,            # [N, 3, H, W]
                "proj_matrices": proj,   # [N, 4, 4]
                "depth": depth,          # [1, H, W]
                "depth_min": depth_min,  # scalar
                "depth_max": depth_max,  # scalar
                "mask": mask}            # [1, H, W]
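For orientation, a minimal training-loader sketch for the dataset above; the data path and scan-list file are placeholders for a local DTU training layout.

# Hypothetical usage of datasets/dtu_yao.py; paths are placeholders.
from torch.utils.data import DataLoader
from datasets.dtu_yao import MVSDataset

dataset = MVSDataset(datapath='/data/dtu_training/', listfile='./lists/dtu/train.txt',
                     mode='train', nviews=5, robust_train=True)
loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=4, drop_last=True)
sample = next(iter(loader))
print(sample['imgs']['level_0'].shape)   # [B, N, 3, 512, 640]
print(sample['depth']['level_0'].shape)  # [B, 1, 512, 640]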
158
IGEV-MVS/datasets/dtu_yao_eval.py
Normal file
@ -0,0 +1,158 @@
from torch.utils.data import Dataset
import numpy as np
import os
from PIL import Image
from datasets.data_io import *
import cv2


class MVSDataset(Dataset):
    def __init__(self, datapath, listfile, nviews=5, img_wh=(1600, 1152)):
        super(MVSDataset, self).__init__()
        self.levels = 4
        self.datapath = datapath
        self.listfile = listfile
        self.nviews = nviews
        self.img_wh = img_wh
        self.metas = self.build_list()

    def build_list(self):
        metas = []
        with open(self.listfile) as f:
            scans = f.readlines()
            scans = [line.rstrip() for line in scans]

        for scan in scans:
            pair_file = "{}/pair.txt".format(scan)
            # read the pair file
            with open(os.path.join(self.datapath, pair_file)) as f:
                num_viewpoint = int(f.readline())
                # viewpoints (49)
                for view_idx in range(num_viewpoint):
                    ref_view = int(f.readline().rstrip())
                    src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
                    metas.append((scan, ref_view, src_views))
        print("dataset", "metas:", len(metas))
        return metas

    def __len__(self):
        return len(self.metas)

    def read_cam_file(self, filename):
        with open(filename) as f:
            lines = f.readlines()
            lines = [line.rstrip() for line in lines]
        # extrinsics: line [1,5), 4x4 matrix
        extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4))
        # intrinsics: line [7,10), 3x3 matrix
        intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3))

        depth_min = float(lines[11].split()[0])
        depth_max = float(lines[11].split()[-1])
        return intrinsics, extrinsics, depth_min, depth_max

    def read_mask(self, filename):
        img = Image.open(filename)
        np_img = np.array(img, dtype=np.float32)
        np_img = (np_img > 10).astype(np.float32)
        return np_img

    def read_img(self, filename):
        img = Image.open(filename)
        # scale 0~255 to -1~1
        np_img = 2 * np.array(img, dtype=np.float32) / 255. - 1
        np_img = cv2.resize(np_img, self.img_wh, interpolation=cv2.INTER_LINEAR)

        h, w, _ = np_img.shape
        np_img_ms = {
            "level_3": cv2.resize(np_img, (w//8, h//8), interpolation=cv2.INTER_LINEAR),
            "level_2": cv2.resize(np_img, (w//4, h//4), interpolation=cv2.INTER_LINEAR),
            "level_1": cv2.resize(np_img, (w//2, h//2), interpolation=cv2.INTER_LINEAR),
            "level_0": np_img
        }
        return np_img_ms

    def __getitem__(self, idx):
        scan, ref_view, src_views = self.metas[idx]
        # use only the reference view and first nviews-1 source views
        view_ids = [ref_view] + src_views[:self.nviews - 1]
        img_w = 1600
        img_h = 1200
        imgs_0 = []
        imgs_1 = []
        imgs_2 = []
        imgs_3 = []

        depth_min = None
        depth_max = None

        proj_matrices_0 = []
        proj_matrices_1 = []
        proj_matrices_2 = []
        proj_matrices_3 = []

        for i, vid in enumerate(view_ids):
            img_filename = os.path.join(self.datapath, '{}/images/{:0>8}.jpg'.format(scan, vid))
            proj_mat_filename = os.path.join(self.datapath, '{}/cams_1/{:0>8}_cam.txt'.format(scan, vid))

            imgs = self.read_img(img_filename)
            imgs_0.append(imgs['level_0'])
            imgs_1.append(imgs['level_1'])
            imgs_2.append(imgs['level_2'])
            imgs_3.append(imgs['level_3'])

            intrinsics, extrinsics, depth_min_, depth_max_ = self.read_cam_file(proj_mat_filename)
            intrinsics[0] *= self.img_wh[0] / img_w
            intrinsics[1] *= self.img_wh[1] / img_h
            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 0.125
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_3.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_2.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_1.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_0.append(proj_mat)

            if i == 0:  # reference view
                depth_min = depth_min_
                depth_max = depth_max_

        imgs_0 = np.stack(imgs_0).transpose([0, 3, 1, 2])
        imgs_1 = np.stack(imgs_1).transpose([0, 3, 1, 2])
        imgs_2 = np.stack(imgs_2).transpose([0, 3, 1, 2])
        imgs_3 = np.stack(imgs_3).transpose([0, 3, 1, 2])
        imgs = {}
        imgs['level_0'] = imgs_0
        imgs['level_1'] = imgs_1
        imgs['level_2'] = imgs_2
        imgs['level_3'] = imgs_3
        # proj_matrices: N*4*4
        proj_matrices_0 = np.stack(proj_matrices_0)
        proj_matrices_1 = np.stack(proj_matrices_1)
        proj_matrices_2 = np.stack(proj_matrices_2)
        proj_matrices_3 = np.stack(proj_matrices_3)
        proj = {}
        proj['level_3'] = proj_matrices_3
        proj['level_2'] = proj_matrices_2
        proj['level_1'] = proj_matrices_1
        proj['level_0'] = proj_matrices_0

        return {"imgs": imgs,            # N*3*H0*W0
                "proj_matrices": proj,   # N*4*4
                "depth_min": depth_min,  # scalar
                "depth_max": depth_max,  # scalar
                "filename": scan + '/{}/' + '{:0>8}'.format(view_ids[0]) + "{}"}
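The `[1::2]` slice used by every `build_list`/`build_metas` in this commit relies on the pair.txt line format, where a count is followed by interleaved view ids and matching scores. A short sketch of what it extracts (the line content is illustrative):

# pair.txt source-view lines interleave view ids with matching scores;
# every second token from index 1 keeps the ids and drops the scores.
line = "10 10 2346.41 1 2036.53 9 1243.89"
tokens = line.split()                       # first token: number of source views
src_views = [int(x) for x in tokens[1::2]]  # [10, 1, 9]
scores = [float(x) for x in tokens[2::2]]   # [2346.41, 2036.53, 1243.89]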
158
IGEV-MVS/datasets/eth3d.py
Normal file
@ -0,0 +1,158 @@
from torch.utils.data import Dataset
from datasets.data_io import *
import os
import numpy as np
import cv2
from PIL import Image


class MVSDataset(Dataset):
    def __init__(self, datapath, split='test', n_views=7, img_wh=(1920, 1280)):
        self.levels = 4
        self.datapath = datapath
        self.img_wh = img_wh
        self.split = split
        self.build_metas()
        self.n_views = n_views

    def build_metas(self):
        self.metas = []
        if self.split == "test":
            self.scans = ['botanical_garden', 'boulders', 'bridge', 'door',
                          'exhibition_hall', 'lecture_room', 'living_room', 'lounge',
                          'observatory', 'old_computer', 'statue', 'terrace_2']
        elif self.split == "train":
            self.scans = ['courtyard', 'delivery_area', 'electro', 'facade',
                          'kicker', 'meadow', 'office', 'pipes', 'playground',
                          'relief', 'relief_2', 'terrace', 'terrains']

        for scan in self.scans:
            with open(os.path.join(self.datapath, scan, 'pair.txt')) as f:
                num_viewpoint = int(f.readline())
                for view_idx in range(num_viewpoint):
                    ref_view = int(f.readline().rstrip())
                    src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
                    if len(src_views) != 0:
                        self.metas += [(scan, -1, ref_view, src_views)]

    def read_cam_file(self, filename):
        with open(filename) as f:
            lines = [line.rstrip() for line in f.readlines()]
        # extrinsics: line [1,5), 4x4 matrix
        extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ')
        extrinsics = extrinsics.reshape((4, 4))
        # intrinsics: line [7,10), 3x3 matrix
        intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ')
        intrinsics = intrinsics.reshape((3, 3))

        depth_min = float(lines[11].split()[0])
        if depth_min < 0:
            depth_min = 1
        depth_max = float(lines[11].split()[-1])

        return intrinsics, extrinsics, depth_min, depth_max

    def read_img(self, filename, h, w):
        img = Image.open(filename)
        # scale 0~255 to -1~1
        np_img = 2 * np.array(img, dtype=np.float32) / 255. - 1
        original_h, original_w, _ = np_img.shape
        np_img = cv2.resize(np_img, self.img_wh, interpolation=cv2.INTER_LINEAR)

        np_img_ms = {
            "level_3": cv2.resize(np_img, (w//8, h//8), interpolation=cv2.INTER_LINEAR),
            "level_2": cv2.resize(np_img, (w//4, h//4), interpolation=cv2.INTER_LINEAR),
            "level_1": cv2.resize(np_img, (w//2, h//2), interpolation=cv2.INTER_LINEAR),
            "level_0": np_img
        }
        return np_img_ms, original_h, original_w

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        scan, _, ref_view, src_views = self.metas[idx]
        # use only the reference view and first n_views-1 source views
        view_ids = [ref_view] + src_views[:self.n_views - 1]
        imgs_0 = []
        imgs_1 = []
        imgs_2 = []
        imgs_3 = []

        depth_min = None
        depth_max = None

        proj_matrices_0 = []
        proj_matrices_1 = []
        proj_matrices_2 = []
        proj_matrices_3 = []

        for i, vid in enumerate(view_ids):
            img_filename = os.path.join(self.datapath, scan, f'images/{vid:08d}.jpg')
            proj_mat_filename = os.path.join(self.datapath, scan, f'cams_1/{vid:08d}_cam.txt')

            imgs, original_h, original_w = self.read_img(img_filename, self.img_wh[1], self.img_wh[0])
            imgs_0.append(imgs['level_0'])
            imgs_1.append(imgs['level_1'])
            imgs_2.append(imgs['level_2'])
            imgs_3.append(imgs['level_3'])

            intrinsics, extrinsics, depth_min_, depth_max_ = self.read_cam_file(proj_mat_filename)
            intrinsics[0] *= self.img_wh[0] / original_w
            intrinsics[1] *= self.img_wh[1] / original_h

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 0.125
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_3.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_2.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_1.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_0.append(proj_mat)

            if i == 0:  # reference view
                depth_min = depth_min_
                depth_max = depth_max_

        # imgs: N*3*H0*W0, N is number of images
        imgs_0 = np.stack(imgs_0).transpose([0, 3, 1, 2])
        imgs_1 = np.stack(imgs_1).transpose([0, 3, 1, 2])
        imgs_2 = np.stack(imgs_2).transpose([0, 3, 1, 2])
        imgs_3 = np.stack(imgs_3).transpose([0, 3, 1, 2])
        imgs = {}
        imgs['level_0'] = imgs_0
        imgs['level_1'] = imgs_1
        imgs['level_2'] = imgs_2
        imgs['level_3'] = imgs_3
        # proj_matrices: N*4*4
        proj_matrices_0 = np.stack(proj_matrices_0)
        proj_matrices_1 = np.stack(proj_matrices_1)
        proj_matrices_2 = np.stack(proj_matrices_2)
        proj_matrices_3 = np.stack(proj_matrices_3)
        proj = {}
        proj['level_3'] = proj_matrices_3
        proj['level_2'] = proj_matrices_2
        proj['level_1'] = proj_matrices_1
        proj['level_0'] = proj_matrices_0

        return {"imgs": imgs,            # N*3*H0*W0
                "proj_matrices": proj,   # N*4*4
                "depth_min": depth_min,  # scalar
                "depth_max": depth_max,  # scalar
                "filename": scan + '/{}/' + '{:0>8}'.format(view_ids[0]) + "{}"}
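Both evaluation datasets rescale the intrinsics whenever the image is resized: focal lengths and principal point scale linearly with resolution. A standalone sketch of that adjustment (matrix values are illustrative):

# When resizing from (original_w, original_h) to (new_w, new_h), row 0 of K
# (fx, 0, cx) scales by new_w/original_w and row 1 (0, fy, cy) by
# new_h/original_h, as in the read_cam_file consumers above.
import numpy as np

K = np.array([[2892.3, 0.0, 960.0],
              [0.0, 2883.2, 540.0],
              [0.0, 0.0, 1.0]], dtype=np.float32)
original_wh, new_wh = (1920, 1080), (1920, 1280)
K[0] *= new_wh[0] / original_wh[0]
K[1] *= new_wh[1] / original_wh[1]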
156
IGEV-MVS/datasets/tanks.py
Normal file
@ -0,0 +1,156 @@
from torch.utils.data import Dataset
from datasets.data_io import *
import os
import numpy as np
import cv2
from PIL import Image


class MVSDataset(Dataset):
    def __init__(self, datapath, n_views=7, img_wh=(1920, 1024), split='intermediate'):
        self.levels = 4
        self.datapath = datapath
        self.img_wh = img_wh
        self.split = split
        self.build_metas()
        self.n_views = n_views

    def build_metas(self):
        self.metas = []
        if self.split == 'intermediate':
            self.scans = ['Family', 'Francis', 'Horse', 'Lighthouse',
                          'M60', 'Panther', 'Playground', 'Train']
        elif self.split == 'advanced':
            self.scans = ['Auditorium', 'Ballroom', 'Courtroom',
                          'Museum', 'Palace', 'Temple']

        for scan in self.scans:
            with open(os.path.join(self.datapath, self.split, scan, 'pair.txt')) as f:
                num_viewpoint = int(f.readline())
                for view_idx in range(num_viewpoint):
                    ref_view = int(f.readline().rstrip())
                    src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
                    if len(src_views) != 0:
                        self.metas += [(scan, -1, ref_view, src_views)]

    def read_cam_file(self, filename):
        with open(filename) as f:
            lines = [line.rstrip() for line in f.readlines()]
        # extrinsics: line [1,5), 4x4 matrix
        extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ')
        extrinsics = extrinsics.reshape((4, 4))
        # intrinsics: line [7,10), 3x3 matrix
        intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ')
        intrinsics = intrinsics.reshape((3, 3))

        depth_min = float(lines[11].split()[0])
        depth_max = float(lines[11].split()[-1])

        return intrinsics, extrinsics, depth_min, depth_max

    def read_img(self, filename, h, w):
        img = Image.open(filename)
        # scale 0~255 to -1~1
        np_img = 2 * np.array(img, dtype=np.float32) / 255. - 1
        original_h, original_w, _ = np_img.shape
        np_img = cv2.resize(np_img, self.img_wh, interpolation=cv2.INTER_LINEAR)

        np_img_ms = {
            "level_3": cv2.resize(np_img, (w//8, h//8), interpolation=cv2.INTER_LINEAR),
            "level_2": cv2.resize(np_img, (w//4, h//4), interpolation=cv2.INTER_LINEAR),
            "level_1": cv2.resize(np_img, (w//2, h//2), interpolation=cv2.INTER_LINEAR),
            "level_0": np_img
        }
        return np_img_ms, original_h, original_w

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        scan, _, ref_view, src_views = self.metas[idx]
        # use only the reference view and first n_views-1 source views
        view_ids = [ref_view] + src_views[:self.n_views - 1]

        imgs_0 = []
        imgs_1 = []
        imgs_2 = []
        imgs_3 = []

        depth_min = None
        depth_max = None

        proj_matrices_0 = []
        proj_matrices_1 = []
        proj_matrices_2 = []
        proj_matrices_3 = []

        for i, vid in enumerate(view_ids):
            img_filename = os.path.join(self.datapath, self.split, scan, f'images/{vid:08d}.jpg')
            proj_mat_filename = os.path.join(self.datapath, self.split, scan, f'cams_1/{vid:08d}_cam.txt')

            imgs, original_h, original_w = self.read_img(img_filename, self.img_wh[1], self.img_wh[0])
            imgs_0.append(imgs['level_0'])
            imgs_1.append(imgs['level_1'])
            imgs_2.append(imgs['level_2'])
            imgs_3.append(imgs['level_3'])

            intrinsics, extrinsics, depth_min_, depth_max_ = self.read_cam_file(proj_mat_filename)
            intrinsics[0] *= self.img_wh[0] / original_w
            intrinsics[1] *= self.img_wh[1] / original_h

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 0.125
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_3.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_2.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_1.append(proj_mat)

            proj_mat = extrinsics.copy()
            intrinsics[:2, :] *= 2
            proj_mat[:3, :4] = np.matmul(intrinsics, proj_mat[:3, :4])
            proj_matrices_0.append(proj_mat)

            if i == 0:  # reference view
                depth_min = depth_min_
                depth_max = depth_max_

        # imgs: N*3*H0*W0, N is number of images
        imgs_0 = np.stack(imgs_0).transpose([0, 3, 1, 2])
        imgs_1 = np.stack(imgs_1).transpose([0, 3, 1, 2])
        imgs_2 = np.stack(imgs_2).transpose([0, 3, 1, 2])
        imgs_3 = np.stack(imgs_3).transpose([0, 3, 1, 2])
        imgs = {}
        imgs['level_0'] = imgs_0
        imgs['level_1'] = imgs_1
        imgs['level_2'] = imgs_2
        imgs['level_3'] = imgs_3
        # proj_matrices: N*4*4
        proj_matrices_0 = np.stack(proj_matrices_0)
        proj_matrices_1 = np.stack(proj_matrices_1)
        proj_matrices_2 = np.stack(proj_matrices_2)
        proj_matrices_3 = np.stack(proj_matrices_3)
        proj = {}
        proj['level_3'] = proj_matrices_3
        proj['level_2'] = proj_matrices_2
        proj['level_1'] = proj_matrices_1
        proj['level_0'] = proj_matrices_0

        return {"imgs": imgs,            # N*3*H0*W0
                "proj_matrices": proj,   # N*4*4
                "depth_min": depth_min,  # scalar
                "depth_max": depth_max,  # scalar
                "filename": scan + '/{}/' + '{:0>8}'.format(view_ids[0]) + "{}"}
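Every dataset in this commit builds its per-level projection matrices the same way: copy the 4x4 extrinsics and overwrite the top 3x4 block with K @ [R|t], halving the intrinsics once per pyramid level. A condensed, equivalent sketch of one level (the helper name and inputs are hypothetical):

# Condensed form of the proj-matrix construction repeated in the datasets:
# level 0 uses the full working-resolution K; level i scales fx, fy, cx, cy
# by 0.5**i before forming the 3x4 block.
import numpy as np

def build_proj_matrix(intrinsics, extrinsics, level):
    K = intrinsics.copy()
    K[:2, :] *= 0.5 ** level
    proj = extrinsics.copy()          # 4x4 world-to-camera matrix
    proj[:3, :4] = K @ proj[:3, :4]   # top block becomes K @ [R|t]
    return proj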
450
IGEV-MVS/evaluate_mvs.py
Normal file
@ -0,0 +1,450 @@
import argparse
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
import time
from datasets import find_dataset_def
from core.igev_mvs import IGEVMVS
from utils import *
import sys
import cv2
from datasets.data_io import read_pfm, save_pfm
from core.submodule import depth_unnormalization
from plyfile import PlyData, PlyElement
from tqdm import tqdm
from PIL import Image

cudnn.benchmark = True

parser = argparse.ArgumentParser(description='Predict depth, filter, and fuse')
parser.add_argument('--model', default='IterMVS', help='select model')
parser.add_argument('--dataset', default='dtu_yao_eval', help='select dataset')
parser.add_argument('--testpath', default='/data/dtu_data/dtu_test/', help='testing data path')
parser.add_argument('--testlist', default='./lists/dtu/test.txt', help='testing scan list')
parser.add_argument('--maxdisp', default=256)
parser.add_argument('--split', default='intermediate', help='select data')
parser.add_argument('--batch_size', type=int, default=2, help='testing batch size')
parser.add_argument('--n_views', type=int, default=5, help='num of view')
parser.add_argument('--img_wh', nargs='+', type=int, default=[640, 480],
                    help='width and height of the image')
parser.add_argument('--loadckpt', default='./pretrained_models/dtu.ckpt', help='load a specific checkpoint')
parser.add_argument('--outdir', default='./output/', help='output dir')
parser.add_argument('--display', action='store_true', help='display depth images and masks')
parser.add_argument('--iteration', type=int, default=32, help='num of iteration of GRU')
parser.add_argument('--geo_pixel_thres', type=float, default=1, help='pixel threshold for geometric consistency filtering')
parser.add_argument('--geo_depth_thres', type=float, default=0.01, help='depth threshold for geometric consistency filtering')
parser.add_argument('--photo_thres', type=float, default=0.3, help='threshold for photometric consistency filtering')

# parse arguments and check
args = parser.parse_args()
print("argv:", sys.argv[1:])
print_args(args)

if args.dataset == "dtu_yao_eval":
    img_wh = (1600, 1152)
elif args.dataset == "tanks":
    img_wh = (1920, 1024)
elif args.dataset == "eth3d":
    img_wh = (1920, 1280)
else:
    img_wh = (args.img_wh[0], args.img_wh[1])  # custom dataset


# read intrinsics and extrinsics
def read_camera_parameters(filename):
    with open(filename) as f:
        lines = f.readlines()
        lines = [line.rstrip() for line in lines]
    # extrinsics: line [1,5), 4x4 matrix
    extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4))
    # intrinsics: line [7,10), 3x3 matrix
    intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3))

    return intrinsics, extrinsics


# read an image
def read_img(filename, img_wh):
    img = Image.open(filename)
    # scale 0~255 to 0~1
    np_img = np.array(img, dtype=np.float32) / 255.
    original_h, original_w, _ = np_img.shape
    np_img = cv2.resize(np_img, img_wh, interpolation=cv2.INTER_LINEAR)
    return np_img, original_h, original_w


# save a binary mask
def save_mask(filename, mask):
    assert mask.dtype == np.bool_
    mask = mask.astype(np.uint8) * 255
    Image.fromarray(mask).save(filename)


def save_depth_img(filename, depth):
    depth = depth.astype(np.float32) * 255
    Image.fromarray(depth).save(filename)


def read_pair_file(filename):
    data = []
    with open(filename) as f:
        num_viewpoint = int(f.readline())
        # 49 viewpoints
        for view_idx in range(num_viewpoint):
            ref_view = int(f.readline().rstrip())
            src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
            if len(src_views) != 0:
                data.append((ref_view, src_views))
    return data


# run MVS model to save depth maps
def save_depth():
    # dataset, dataloader
    MVSDataset = find_dataset_def(args.dataset)
    if args.dataset == "dtu_yao_eval":
        test_dataset = MVSDataset(args.testpath, args.testlist, args.n_views, img_wh)
    elif args.dataset == "tanks":
        test_dataset = MVSDataset(args.testpath, args.n_views, img_wh, args.split)
    elif args.dataset == "eth3d":
        test_dataset = MVSDataset(args.testpath, args.split, args.n_views, img_wh)
    else:
        test_dataset = MVSDataset(args.testpath, args.n_views, img_wh)
    TestImgLoader = DataLoader(test_dataset, args.batch_size, shuffle=False, num_workers=4, drop_last=False)

    # model
    model = IGEVMVS(args)
    model = nn.DataParallel(model)
    model.cuda()

    # load checkpoint file specified by args.loadckpt
    print("loading model {}".format(args.loadckpt))
    state_dict = torch.load(args.loadckpt)
    model.load_state_dict(state_dict['model'])
    model.eval()

    with torch.no_grad():
        tbar = tqdm(TestImgLoader)
        for batch_idx, sample in enumerate(tbar):
            start_time = time.time()
            sample_cuda = tocuda(sample)
            disp_prediction = model(sample_cuda["imgs"], sample_cuda["proj_matrices"],
                                    sample_cuda["depth_min"], sample_cuda["depth_max"], test_mode=True)

            b = sample_cuda["depth_min"].shape[0]

            inverse_depth_min = (1.0 / sample_cuda["depth_min"]).view(b, 1, 1, 1)
            inverse_depth_max = (1.0 / sample_cuda["depth_max"]).view(b, 1, 1, 1)

            depth_prediction = depth_unnormalization(disp_prediction, inverse_depth_min, inverse_depth_max)
            depth_prediction = tensor2numpy(depth_prediction.float())
            del sample_cuda, disp_prediction
            tbar.set_description('Iter {}/{}, time = {:.3f}'.format(batch_idx, len(TestImgLoader), time.time() - start_time))
            filenames = sample["filename"]

            # save depth maps and confidence maps
            for filename, depth_est in zip(filenames, depth_prediction):
                depth_filename = os.path.join(args.outdir, filename.format('depth_est', '.pfm'))
                os.makedirs(depth_filename.rsplit('/', 1)[0], exist_ok=True)
                # save depth maps
                depth_est = np.squeeze(depth_est, 0)
                save_pfm(depth_filename, depth_est)


# project the reference point cloud into the source view, then project back
def reproject_with_depth(depth_ref, intrinsics_ref, extrinsics_ref, depth_src, intrinsics_src, extrinsics_src):
    width, height = depth_ref.shape[1], depth_ref.shape[0]
    ## step1. project reference pixels to the source view
    # reference view x, y
    x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height))
    x_ref, y_ref = x_ref.reshape([-1]), y_ref.reshape([-1])
    # reference 3D space
    xyz_ref = np.matmul(np.linalg.inv(intrinsics_ref),
                        np.vstack((x_ref, y_ref, np.ones_like(x_ref))) * depth_ref.reshape([-1]))
    # source 3D space
    xyz_src = np.matmul(np.matmul(extrinsics_src, np.linalg.inv(extrinsics_ref)),
                        np.vstack((xyz_ref, np.ones_like(x_ref))))[:3]
    # source view x, y
    K_xyz_src = np.matmul(intrinsics_src, xyz_src)
    xy_src = K_xyz_src[:2] / K_xyz_src[2:3]

    ## step2. reproject the source view points with source view depth estimation
    # find the depth estimation of the source view
    x_src = xy_src[0].reshape([height, width]).astype(np.float32)
    y_src = xy_src[1].reshape([height, width]).astype(np.float32)
    sampled_depth_src = cv2.remap(depth_src, x_src, y_src, interpolation=cv2.INTER_LINEAR)
    # mask = sampled_depth_src > 0

    # source 3D space
    # NOTE that we should use sampled source-view depth here to project back
    xyz_src = np.matmul(np.linalg.inv(intrinsics_src),
                        np.vstack((xy_src, np.ones_like(x_ref))) * sampled_depth_src.reshape([-1]))
    # reference 3D space
    xyz_reprojected = np.matmul(np.matmul(extrinsics_ref, np.linalg.inv(extrinsics_src)),
                                np.vstack((xyz_src, np.ones_like(x_ref))))[:3]
    # reference view x, y, depth
    depth_reprojected = xyz_reprojected[2].reshape([height, width]).astype(np.float32)
    K_xyz_reprojected = np.matmul(intrinsics_ref, xyz_reprojected)
    xy_reprojected = K_xyz_reprojected[:2] / (K_xyz_reprojected[2:3] + 1e-6)
    x_reprojected = xy_reprojected[0].reshape([height, width]).astype(np.float32)
    y_reprojected = xy_reprojected[1].reshape([height, width]).astype(np.float32)

    return depth_reprojected, x_reprojected, y_reprojected, x_src, y_src


def check_geometric_consistency(depth_ref, intrinsics_ref, extrinsics_ref, depth_src, intrinsics_src, extrinsics_src, thre1, thre2):
    width, height = depth_ref.shape[1], depth_ref.shape[0]
    x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height))
    depth_reprojected, x2d_reprojected, y2d_reprojected, x2d_src, y2d_src = reproject_with_depth(depth_ref,
                                                                                                 intrinsics_ref,
                                                                                                 extrinsics_ref,
                                                                                                 depth_src,
                                                                                                 intrinsics_src,
                                                                                                 extrinsics_src)
    # check |p_reproj - p_1| < 1
    dist = np.sqrt((x2d_reprojected - x_ref) ** 2 + (y2d_reprojected - y_ref) ** 2)

    # check |d_reproj - d_1| / d_1 < 0.01
    depth_diff = np.abs(depth_reprojected - depth_ref)
    relative_depth_diff = depth_diff / depth_ref
    masks = []
    for i in range(2, 11):
        mask = np.logical_and(dist < i / thre1, relative_depth_diff < i / thre2)
        masks.append(mask)
    depth_reprojected[~mask] = 0

    return masks, mask, depth_reprojected, x2d_src, y2d_src


def filter_depth(scan_folder, out_folder, plyfilename, geo_pixel_thres, geo_depth_thres, photo_thres, img_wh, geo_mask_thres=3):
    # the pair file
    pair_file = os.path.join(scan_folder, "pair.txt")
    # for the final point cloud
    vertexs = []
    vertex_colors = []

    pair_data = read_pair_file(pair_file)
    nviews = len(pair_data)

    thre_left = -2
    thre_right = 2
    total_iter = 10
    for iter in range(total_iter):
        thre = (thre_left + thre_right) / 2
        print(f"{iter} {10 ** thre}")
        depth_est_averaged = []
        geo_mask_all = []
        # for each reference view and the corresponding source views
        for ref_view, src_views in pair_data:
            # load the camera parameters
            ref_intrinsics, ref_extrinsics = read_camera_parameters(
                os.path.join(scan_folder, 'cams_1/{:0>8}_cam.txt'.format(ref_view)))
            ref_img, original_h, original_w = read_img(os.path.join(scan_folder, 'images/{:0>8}.jpg'.format(ref_view)), img_wh)
            ref_intrinsics[0] *= img_wh[0] / original_w
            ref_intrinsics[1] *= img_wh[1] / original_h
            # load the estimated depth of the reference view
            ref_depth_est = read_pfm(os.path.join(out_folder, 'depth_est/{:0>8}.pfm'.format(ref_view)))[0]
            ref_depth_est = np.squeeze(ref_depth_est, 2)

            all_srcview_depth_ests = []
            # compute the geometric mask
            geo_mask_sum = 0
            geo_mask_sums = []
            n = 1 + len(src_views)
            ct = 0
            for src_view in src_views:
                ct = ct + 1
                # camera parameters of the source view
                src_intrinsics, src_extrinsics = read_camera_parameters(
                    os.path.join(scan_folder, 'cams_1/{:0>8}_cam.txt'.format(src_view)))
                _, original_h, original_w = read_img(os.path.join(scan_folder, 'images/{:0>8}.jpg'.format(src_view)), img_wh)
                src_intrinsics[0] *= img_wh[0] / original_w
                src_intrinsics[1] *= img_wh[1] / original_h

                # the estimated depth of the source view
                src_depth_est = read_pfm(os.path.join(out_folder, 'depth_est/{:0>8}.pfm'.format(src_view)))[0]

                masks, geo_mask, depth_reprojected, _, _ = check_geometric_consistency(ref_depth_est, ref_intrinsics, ref_extrinsics,
                                                                                       src_depth_est,
                                                                                       src_intrinsics, src_extrinsics, 10 ** thre * 4, 10 ** thre * 1300)
                if ct == 1:
                    for i in range(2, n):
                        geo_mask_sums.append(masks[i - 2].astype(np.int32))
                else:
                    for i in range(2, n):
                        geo_mask_sums[i - 2] += masks[i - 2].astype(np.int32)

                geo_mask_sum += geo_mask.astype(np.int32)
                all_srcview_depth_ests.append(depth_reprojected)

            geo_mask = geo_mask_sum >= n
            for i in range(2, n):
                geo_mask = np.logical_or(geo_mask, geo_mask_sums[i - 2] >= i)

            depth_est_averaged.append((sum(all_srcview_depth_ests) + ref_depth_est) / (geo_mask_sum + 1))
            geo_mask_all.append(np.mean(geo_mask))
            final_mask = geo_mask

            if iter == total_iter - 1:
                os.makedirs(os.path.join(out_folder, "mask"), exist_ok=True)
                save_mask(os.path.join(out_folder, "mask/{:0>8}_geo.png".format(ref_view)), geo_mask)
                save_mask(os.path.join(out_folder, "mask/{:0>8}_final.png".format(ref_view)), final_mask)

                print("processing {}, ref-view{:0>2}, geo_mask:{:3f} final_mask: {:3f}".format(scan_folder, ref_view,
                                                                                               geo_mask.mean(), final_mask.mean()))

                if args.display:
                    cv2.imshow('ref_img', ref_img[:, :, ::-1])
                    cv2.imshow('ref_depth', ref_depth_est / np.max(ref_depth_est))
                    cv2.imshow('ref_depth * geo_mask', ref_depth_est * geo_mask.astype(np.float32) / np.max(ref_depth_est))
                    cv2.imshow('ref_depth * mask', ref_depth_est * final_mask.astype(np.float32) / np.max(ref_depth_est))
                    cv2.waitKey(0)

                height, width = depth_est_averaged[-1].shape[:2]
                x, y = np.meshgrid(np.arange(0, width), np.arange(0, height))

                valid_points = final_mask
                # print("valid_points", valid_points.mean())
                x, y, depth = x[valid_points], y[valid_points], depth_est_averaged[-1][valid_points]

                color = ref_img[valid_points]
                xyz_ref = np.matmul(np.linalg.inv(ref_intrinsics),
                                    np.vstack((x, y, np.ones_like(x))) * depth)
                xyz_world = np.matmul(np.linalg.inv(ref_extrinsics),
                                      np.vstack((xyz_ref, np.ones_like(x))))[:3]
                vertexs.append(xyz_world.transpose((1, 0)))
                vertex_colors.append((color * 255).astype(np.uint8))
        if np.mean(geo_mask_all) >= 0.25:
            thre_left = thre
        else:
            thre_right = thre
    vertexs = np.concatenate(vertexs, axis=0)
    vertex_colors = np.concatenate(vertex_colors, axis=0)
    vertexs = np.array([tuple(v) for v in vertexs], dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')])
    vertex_colors = np.array([tuple(v) for v in vertex_colors], dtype=[('red', 'u1'), ('green', 'u1'), ('blue', 'u1')])

    vertex_all = np.empty(len(vertexs), vertexs.dtype.descr + vertex_colors.dtype.descr)
    for prop in vertexs.dtype.names:
        vertex_all[prop] = vertexs[prop]
    for prop in vertex_colors.dtype.names:
        vertex_all[prop] = vertex_colors[prop]

    el = PlyElement.describe(vertex_all, 'vertex')
    PlyData([el]).write(plyfilename)
    print("saving the final model to", plyfilename)


if __name__ == '__main__':
    save_depth()
    if args.dataset == "dtu_yao_eval":
        with open(args.testlist) as f:
            scans = f.readlines()
            scans = [line.rstrip() for line in scans]

        for scan in scans:
            scan_id = int(scan[4:])
            scan_folder = os.path.join(args.testpath, scan)
            out_folder = os.path.join(args.outdir, scan)
            filter_depth(scan_folder, out_folder, os.path.join(args.outdir, 'igev_mvs{:0>3}_l3.ply'.format(scan_id)),
                         args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, 4)
    elif args.dataset == "tanks":
        # intermediate dataset
        if args.split == "intermediate":
            scans = ['Family', 'Francis', 'Horse', 'Lighthouse',
                     'M60', 'Panther', 'Playground', 'Train']
            geo_mask_thres = {'Family': 5,
                              'Francis': 6,
                              'Horse': 5,
                              'Lighthouse': 6,
                              'M60': 5,
                              'Panther': 5,
                              'Playground': 5,
                              'Train': 5}

            for scan in scans:
                scan_folder = os.path.join(args.testpath, args.split, scan)
                out_folder = os.path.join(args.outdir, scan)

                filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
                             args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])

        # advanced dataset
        elif args.split == "advanced":
            scans = ['Auditorium', 'Ballroom', 'Courtroom',
                     'Museum', 'Palace', 'Temple']
            geo_mask_thres = {'Auditorium': 3,
                              'Ballroom': 4,
                              'Courtroom': 4,
                              'Museum': 4,
                              'Palace': 5,
                              'Temple': 4}

            for scan in scans:
                scan_folder = os.path.join(args.testpath, args.split, scan)
                out_folder = os.path.join(args.outdir, scan)
                filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
                             args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])

    elif args.dataset == "eth3d":
        if args.split == "test":
            scans = ['botanical_garden', 'boulders', 'bridge', 'door',
                     'exhibition_hall', 'lecture_room', 'living_room', 'lounge',
                     'observatory', 'old_computer', 'statue', 'terrace_2']

            geo_mask_thres = {'botanical_garden': 1,  # 30 images, outdoor
                              'boulders': 1,          # 26 images, outdoor
                              'bridge': 2,            # 110 images, outdoor
                              'door': 2,              # 6 images, indoor
                              'exhibition_hall': 2,   # 68 images, indoor
                              'lecture_room': 2,      # 23 images, indoor
                              'living_room': 2,       # 65 images, indoor
                              'lounge': 1,            # 10 images, indoor
                              'observatory': 2,       # 27 images, outdoor
                              'old_computer': 2,      # 54 images, indoor
                              'statue': 2,            # 10 images, indoor
                              'terrace_2': 2          # 13 images, outdoor
                              }
            for scan in scans:
                start_time = time.time()
                scan_folder = os.path.join(args.testpath, scan)
                out_folder = os.path.join(args.outdir, scan)
                filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
                             args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])
                print('scan: ' + scan + ' time = {:3f}'.format(time.time() - start_time))

        elif args.split == "train":
            scans = ['courtyard', 'delivery_area', 'electro', 'facade',
                     'kicker', 'meadow', 'office', 'pipes', 'playground',
                     'relief', 'relief_2', 'terrace', 'terrains']

            geo_mask_thres = {'courtyard': 1,      # 38 images, outdoor
                              'delivery_area': 2,  # 44 images, indoor
                              'electro': 1,        # 45 images, outdoor
                              'facade': 2,         # 76 images, outdoor
                              'kicker': 1,         # 31 images, indoor
                              'meadow': 1,         # 15 images, outdoor
                              'office': 1,         # 26 images, indoor
                              'pipes': 1,          # 14 images, indoor
                              'playground': 1,     # 38 images, outdoor
                              'relief': 1,         # 31 images, indoor
                              'relief_2': 1,       # 31 images, indoor
                              'terrace': 1,        # 23 images, outdoor
                              'terrains': 2        # 42 images, indoor
                              }

            for scan in scans:
                start_time = time.time()
                scan_folder = os.path.join(args.testpath, scan)
                out_folder = os.path.join(args.outdir, scan)
                filter_depth(scan_folder, out_folder, os.path.join(args.outdir, scan + '.ply'),
                             args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres[scan])
                print('scan: ' + scan + ' time = {:3f}'.format(time.time() - start_time))
    else:
        filter_depth(args.testpath, args.outdir, os.path.join(args.outdir, 'custom.ply'),
                     args.geo_pixel_thres, args.geo_depth_thres, args.photo_thres, img_wh, geo_mask_thres=3)
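One step in `filter_depth` deserves a note: the outer loop is a bisection over a log-scale threshold `thre`. Each pass rebuilds the geometric masks with tolerances of `i / (10**thre * 4)` pixels and `i / (10**thre * 1300)` relative depth, so occupancy shrinks as `thre` grows, and the bounds are narrowed until the mean mask occupancy lands near 0.25. The same bisection in isolation (the ratio function is a stand-in for one full masking pass over all reference views; target and bounds are copied from the script):

# Stand-alone sketch of the threshold bisection used by filter_depth.
def bisect_threshold(evaluate_mask_ratio, target=0.25, lo=-2.0, hi=2.0, iters=10):
    for _ in range(iters):
        mid = (lo + hi) / 2
        if evaluate_mask_ratio(10 ** mid) >= target:
            lo = mid   # masks still dense enough: tighten the tolerance
        else:
            hi = mid   # masks too sparse: loosen it
    return 10 ** ((lo + hi) / 2)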
44
IGEV-MVS/evaluations/dtu/BaseEval2Obj_web.m
Normal file
@ -0,0 +1,44 @@
function BaseEval2Obj_web(BaseEval,method_string,outputPath)

if(nargin<3)
    outputPath='./';
end

% threshold for coloring alpha channel in the range of 0-10 mm
dist_tresshold=10;

cSet=BaseEval.cSet;

Qdata=BaseEval.Qdata;
alpha=min(BaseEval.Ddata,dist_tresshold)/dist_tresshold;

fid=fopen([outputPath method_string '2Stl_' num2str(cSet) '.obj'],'w+');

for cP=1:size(Qdata,2)
    if(BaseEval.DataInMask(cP))
        C=[1 0 0]*alpha(cP)+[1 1 1]*(1-alpha(cP)); % coloring from red to white in the range of 0-10 mm (0 to dist_tresshold)
    else
        C=[0 1 0]*alpha(cP)+[0 0 1]*(1-alpha(cP)); % green to blue for points outside the mask (which are not included in the analysis)
    end
    fprintf(fid,'v %f %f %f %f %f %f\n',[Qdata(1,cP) Qdata(2,cP) Qdata(3,cP) C(1) C(2) C(3)]);
end
fclose(fid);

disp('Data2Stl saved as obj')

Qstl=BaseEval.Qstl;
fid=fopen([outputPath 'Stl2' method_string '_' num2str(cSet) '.obj'],'w+');

alpha=min(BaseEval.Dstl,dist_tresshold)/dist_tresshold;

for cP=1:size(Qstl,2)
    if(BaseEval.StlAbovePlane(cP))
        C=[1 0 0]*alpha(cP)+[1 1 1]*(1-alpha(cP)); % coloring from red to white in the range of 0-10 mm (0 to dist_tresshold)
    else
        C=[0 1 0]*alpha(cP)+[0 0 1]*(1-alpha(cP)); % green to blue for points below plane (which are not included in the analysis)
    end
    fprintf(fid,'v %f %f %f %f %f %f\n',[Qstl(1,cP) Qstl(2,cP) Qstl(3,cP) C(1) C(2) C(3)]);
end
fclose(fid);

disp('Stl2Data saved as obj')
104
IGEV-MVS/evaluations/dtu/BaseEvalMain_web.m
Normal file
@ -0,0 +1,104 @@
clear all
close all
format compact
clc

% script to calculate distances for all included scans (UsedSets)

dataPath='D:\xgw\IterMVS_data\MVS Data\';
plyPath='/data/xgw/IGEV_MVS/conf_03/';
resultsPath='/data/xgw/IGEV_MVS/outputs_conf_03/';

method_string='itermvs';
light_string='l3'; % l3 is the setting with all lights on, l7 is randomly sampled between the 7 settings (index 0-6)
representation_string='Points'; % mvs representation 'Points' or 'Surfaces'

switch representation_string
    case 'Points'
        eval_string='_Eval_'; % results naming
        settings_string='';
end

% get sets used in evaluation
UsedSets=[1 4 9 10 11 12 13 15 23 24 29 32 33 34 48 49 62 75 77 110 114 118];

result = zeros(length(UsedSets),4);

dst=0.2; % min dist between points when reducing

for cIdx=1:length(UsedSets)
    % data set number
    cSet = UsedSets(cIdx)
    % input data name
    DataInName=[plyPath sprintf('%s%03d_%s%s.ply',lower(method_string),cSet,light_string,settings_string)]

    % results name (concatenate strings into one string)
    EvalName=[resultsPath method_string eval_string num2str(cSet) '.mat']

    disp(EvalName)

    % check if file is already computed
    if(~exist(EvalName,'file'))
        disp(DataInName);

        time=clock;time(4:5), drawnow

        tic
        Mesh = plyread(DataInName);
        Qdata=[Mesh.vertex.x Mesh.vertex.y Mesh.vertex.z]';
        toc

        BaseEval=PointCompareMain(cSet,Qdata,dst,dataPath);

        disp('Saving results'), drawnow
        toc
        save(EvalName,'BaseEval');
        toc

        % write obj-file of evaluation
        % BaseEval2Obj_web(BaseEval,method_string, resultsPath)
        % toc
        time=clock;time(4:5), drawnow

        BaseEval.MaxDist=20; % outlier threshold of 20 mm

        BaseEval.FilteredDstl=BaseEval.Dstl(BaseEval.StlAbovePlane); % use only points that are above the plane
        BaseEval.FilteredDstl=BaseEval.FilteredDstl(BaseEval.FilteredDstl<BaseEval.MaxDist); % discard outliers

        BaseEval.FilteredDdata=BaseEval.Ddata(BaseEval.DataInMask); % use only points that are within the mask
        BaseEval.FilteredDdata=BaseEval.FilteredDdata(BaseEval.FilteredDdata<BaseEval.MaxDist); % discard outliers

        fprintf("mean/median Data (acc.) %f/%f\n", mean(BaseEval.FilteredDdata), median(BaseEval.FilteredDdata));
        fprintf("mean/median Stl (comp.) %f/%f\n", mean(BaseEval.FilteredDstl), median(BaseEval.FilteredDstl));
        result(cIdx,1) = mean(BaseEval.FilteredDdata);
        result(cIdx,2) = median(BaseEval.FilteredDdata);
        result(cIdx,3) = mean(BaseEval.FilteredDstl);
        result(cIdx,4) = median(BaseEval.FilteredDstl);
    else
        load(EvalName);

        BaseEval.MaxDist=20; % outlier threshold of 20 mm

        BaseEval.FilteredDstl=BaseEval.Dstl(BaseEval.StlAbovePlane); % use only points that are above the plane
        BaseEval.FilteredDstl=BaseEval.FilteredDstl(BaseEval.FilteredDstl<BaseEval.MaxDist); % discard outliers

        BaseEval.FilteredDdata=BaseEval.Ddata(BaseEval.DataInMask); % use only points that are within the mask
        BaseEval.FilteredDdata=BaseEval.FilteredDdata(BaseEval.FilteredDdata<BaseEval.MaxDist); % discard outliers

        fprintf("mean/median Data (acc.) %f/%f\n", mean(BaseEval.FilteredDdata), median(BaseEval.FilteredDdata));
        fprintf("mean/median Stl (comp.) %f/%f\n", mean(BaseEval.FilteredDstl), median(BaseEval.FilteredDstl));
        result(cIdx,1) = mean(BaseEval.FilteredDdata);
        result(cIdx,2) = median(BaseEval.FilteredDdata);
        result(cIdx,3) = mean(BaseEval.FilteredDstl);
        result(cIdx,4) = median(BaseEval.FilteredDstl);
    end
end

mean_result=mean(result);
fprintf("final evaluation result on all scans: acc.: %f, comp.: %f, overall: %f\n", mean_result(1), mean_result(3), (mean_result(1)+mean_result(3))/2);
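For readers without MATLAB: the accuracy/completeness numbers printed above are mean distances between the reconstruction and the reference scan after masking and a 20 mm outlier cut. A rough Python analogue of the final aggregation, assuming per-scan distance arrays have already been computed (the function name and input layout are hypothetical):

# Rough Python sketch of the DTU aggregation above: accuracy is the mean
# reconstruction-to-reference distance, completeness the reverse direction,
# both after the 20 mm outlier cut; `scans` is a hypothetical list of
# (d_data, d_stl) distance-array pairs, one per evaluated set.
import numpy as np

def dtu_summary(scans, max_dist=20.0):
    acc = [np.mean(d_data[d_data < max_dist]) for d_data, _ in scans]
    comp = [np.mean(d_stl[d_stl < max_dist]) for _, d_stl in scans]
    acc_mean, comp_mean = np.mean(acc), np.mean(comp)
    return acc_mean, comp_mean, (acc_mean + comp_mean) / 2  # overall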
87
IGEV-MVS/evaluations/dtu/ComputeStat_web.m
Normal file
@ -0,0 +1,87 @@
clear all
close all
format compact
clc

% script to calculate the statistics for each scan; this will currently only run if distances
% have been measured for all included scans (UsedSets)

% modify the paths to evaluate your models
dataPath='/home/SampleSet/MVS Data/';
resultsPath='/home/PatchmatchNet/outputs/';

MaxDist=20; % outlier threshold of 20 mm

time=clock;

method_string='patchmatchnet';
light_string='l3'; % 'l7'; l3 is the setting with all lights on, l7 is randomly sampled between the 7 settings (index 0-6)
representation_string='Points'; % mvs representation 'Points' or 'Surfaces'

switch representation_string
    case 'Points'
        eval_string='_Eval_'; % results naming
        settings_string='';
end

% get sets used in evaluation
UsedSets=[1 4 9 10 11 12 13 15 23 24 29 32 33 34 48 49 62 75 77 110 114 118];

nStat=length(UsedSets);

% struct
BaseStat.nStl=zeros(1,nStat);
BaseStat.nData=zeros(1,nStat);
BaseStat.MeanStl=zeros(1,nStat);
BaseStat.MeanData=zeros(1,nStat);
BaseStat.VarStl=zeros(1,nStat);
BaseStat.VarData=zeros(1,nStat);
BaseStat.MedStl=zeros(1,nStat);
BaseStat.MedData=zeros(1,nStat);

for cStat=1:length(UsedSets) % data set number
    currentSet=UsedSets(cStat);

    % input results name
    EvalName=[resultsPath method_string eval_string num2str(currentSet) '.mat'];

    disp(EvalName);
    load(EvalName);

    Dstl=BaseEval.Dstl(BaseEval.StlAbovePlane); % use only points that are above the plane
    Dstl=Dstl(Dstl<MaxDist); % discard outliers

    Ddata=BaseEval.Ddata(BaseEval.DataInMask); % use only points that are within the mask
    Ddata=Ddata(Ddata<MaxDist); % discard outliers

    BaseStat.nStl(cStat)=length(Dstl);
    BaseStat.nData(cStat)=length(Ddata);

    BaseStat.MeanStl(cStat)=mean(Dstl);
    BaseStat.MeanData(cStat)=mean(Ddata);

    BaseStat.VarStl(cStat)=var(Dstl);
    BaseStat.VarData(cStat)=var(Ddata);

    BaseStat.MedStl(cStat)=median(Dstl);
    BaseStat.MedData(cStat)=median(Ddata);

    disp("acc");
    disp(mean(Ddata));
    disp("comp");
    disp(mean(Dstl));
    time=clock;
end

disp(BaseStat);
disp("mean acc")
disp(mean(BaseStat.MeanData));
disp("mean comp")
disp(mean(BaseStat.MeanStl));

totalStatName=[resultsPath 'TotalStat_' method_string eval_string '.mat']
save(totalStatName,'BaseStat','time','MaxDist');
50
IGEV-MVS/evaluations/dtu/MaxDistCP.m
Normal file
@ -0,0 +1,50 @@
function Dist = MaxDistCP(Qto,Qfrom,BB,MaxDist)

Dist=ones(1,size(Qfrom,2))*MaxDist;

Range=floor((BB(2,:)-BB(1,:))/MaxDist);

tic
Done=0;
LookAt=zeros(1,size(Qfrom,2));
for x=0:Range(1)
    for y=0:Range(2)
        for z=0:Range(3)

            Low=BB(1,:)+[x y z]*MaxDist;
            High=Low+MaxDist;

            idxF=find(Qfrom(1,:)>=Low(1) & Qfrom(2,:)>=Low(2) & Qfrom(3,:)>=Low(3) &...
                Qfrom(1,:)<High(1) & Qfrom(2,:)<High(2) & Qfrom(3,:)<High(3));
            SQfrom=Qfrom(:,idxF);
            LookAt(idxF)=LookAt(idxF)+1; % debug

            Low=Low-MaxDist;
            High=High+MaxDist;
            idxT=find(Qto(1,:)>=Low(1) & Qto(2,:)>=Low(2) & Qto(3,:)>=Low(3) &...
                Qto(1,:)<High(1) & Qto(2,:)<High(2) & Qto(3,:)<High(3));
            SQto=Qto(:,idxT);

            if(isempty(SQto))
                Dist(idxF)=MaxDist;
            else
                KDstl=KDTreeSearcher(SQto');
                [~,SDist] = knnsearch(KDstl,SQfrom');
                Dist(idxF)=SDist;
            end

            Done=Done+length(idxF); % debug
        end
    end
end
%Complete=Done/size(Qfrom,2);
%EstTime=(toc/Complete)/60
%toc
%LA=[sum(LookAt==0),...
%   sum(LookAt==1),...
%   sum(LookAt==2),...
%   sum(LookAt==3),...
%   sum(LookAt>3)]
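MaxDistCP chunks both clouds into MaxDist-sized voxels so that each KD-tree query only sees nearby points. With SciPy, the same clamped nearest-neighbour distance can be computed directly; a sketch rather than a line-by-line port:

# SciPy sketch of what MaxDistCP computes: for every point in q_from, the
# distance to its nearest neighbour in q_to, clamped to max_dist. The MATLAB
# version adds voxel chunking purely to keep each KD-tree query small.
import numpy as np
from scipy.spatial import cKDTree

def max_dist_cp(q_to, q_from, max_dist):
    # q_to, q_from: (N, 3) arrays of points
    dist, _ = cKDTree(q_to).query(q_from, k=1, distance_upper_bound=max_dist)
    return np.minimum(dist, max_dist)  # query returns inf beyond the bound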
58
IGEV-MVS/evaluations/dtu/PointCompareMain.m
Normal file
@ -0,0 +1,58 @@
function BaseEval=PointCompareMain(cSet,Qdata,dst,dataPath)
|
||||
% evaluation function the calculates the distantes from the reference data (stl) to the evalution points (Qdata) and the
|
||||
% distances from the evaluation points to the reference
|
||||
|
||||
tic
|
||||
% reduce points 0.2 mm neighbourhood density
|
||||
Qdata=reducePts_haa(Qdata,dst);
|
||||
toc
|
||||
|
||||
StlInName=[dataPath '/Points/stl/stl' sprintf('%03d',cSet) '_total.ply'];
|
||||
|
||||
StlMesh = plyread(StlInName); %STL points already reduced 0.2 mm neighbourhood density
|
||||
Qstl=[StlMesh.vertex.x StlMesh.vertex.y StlMesh.vertex.z]';
|
||||
|
||||
%Load Mask (ObsMask) and Bounding box (BB) and Resolution (Res)
|
||||
Margin=10;
|
||||
MaskName=[dataPath '/ObsMask/ObsMask' num2str(cSet) '_' num2str(Margin) '.mat'];
|
||||
load(MaskName)
|
||||
|
||||
MaxDist=60;
|
||||
disp('Computing Data 2 Stl distances')
|
||||
Ddata = MaxDistCP(Qstl,Qdata,BB,MaxDist);
|
||||
toc
|
||||
|
||||
disp('Computing Stl 2 Data distances')
|
||||
Dstl=MaxDistCP(Qdata,Qstl,BB,MaxDist);
|
||||
disp('Distances computed')
|
||||
toc
|
||||
|
||||
%use mask
|
||||
%From Get mask - inverted & modified.
|
||||
One=ones(1,size(Qdata,2));
|
||||
Qv=(Qdata-BB(1,:)'*One)/Res+1;
|
||||
Qv=round(Qv);
|
||||
|
||||
Midx1=find(Qv(1,:)>0 & Qv(1,:)<=size(ObsMask,1) & Qv(2,:)>0 & Qv(2,:)<=size(ObsMask,2) & Qv(3,:)>0 & Qv(3,:)<=size(ObsMask,3));
|
||||
MidxA=sub2ind(size(ObsMask),Qv(1,Midx1),Qv(2,Midx1),Qv(3,Midx1));
|
||||
Midx2=find(ObsMask(MidxA));
|
||||
|
||||
BaseEval.DataInMask(1:size(Qv,2))=false;
|
||||
BaseEval.DataInMask(Midx1(Midx2))=true; %If Data is within the mask
|
||||
|
||||
BaseEval.cSet=cSet;
|
||||
BaseEval.Margin=Margin; %Margin of masks
|
||||
BaseEval.dst=dst; %Min dist between points when reducing
|
||||
BaseEval.Qdata=Qdata; %Input data points
|
||||
BaseEval.Ddata=Ddata; %distance from data to stl
|
||||
BaseEval.Qstl=Qstl; %Input stl points
|
||||
BaseEval.Dstl=Dstl; %Distance from the stl to data
|
||||
|
||||
load([dataPath '/ObsMask/Plane' num2str(cSet)],'P')
|
||||
BaseEval.GroundPlane=P; % Plane used to destinguise which Stl points are 'used'
|
||||
BaseEval.StlAbovePlane=(P'*[Qstl;ones(1,size(Qstl,2))])>0; %Is stl above 'ground plane'
|
||||
BaseEval.Time=clock; %Time when computation is finished
|
||||
|
||||
|
||||
|
||||
|
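In the standard DTU protocol the two distance sets computed here are summarised as accuracy (Ddata, reconstruction to reference) and completeness (Dstl, reference to reconstruction), filtered by DataInMask and StlAbovePlane respectively. A hedged sketch of that aggregation, assuming the BaseEval fields were exported to NumPy float/boolean arrays (the official summary scripts may filter slightly differently):

import numpy as np

def dtu_summary(ddata, dstl, data_in_mask, stl_above_plane, max_dist=60.0):
    acc = ddata[data_in_mask & (ddata < max_dist)]    # accuracy distances
    comp = dstl[stl_above_plane & (dstl < max_dist)]  # completeness distances
    return float(np.mean(acc)), float(np.mean(comp))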
454
IGEV-MVS/evaluations/dtu/plyread.m
Normal file
@ -0,0 +1,454 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [Elements,varargout] = plyread(Path,Str)
%PLYREAD Read a PLY 3D data file.
%   [DATA,COMMENTS] = PLYREAD(FILENAME) reads a version 1.0 PLY file
%   FILENAME and returns a structure DATA. The fields in this structure
%   are defined by the PLY header; each element type is a field and each
%   element property is a subfield. If the file contains any comments,
%   they are returned in a cell string array COMMENTS.
%
%   [TRI,PTS] = PLYREAD(FILENAME,'tri') or
%   [TRI,PTS,DATA,COMMENTS] = PLYREAD(FILENAME,'tri') converts vertex
%   and face data into triangular connectivity and vertex arrays. The
%   mesh can then be displayed using the TRISURF command.
%
%   Note: This function is slow for large mesh files (+50K faces),
%   especially when reading data with list type properties.
%
%   Example:
%   [Tri,Pts] = PLYREAD('cow.ply','tri');
%   trisurf(Tri,Pts(:,1),Pts(:,2),Pts(:,3));
%   colormap(gray); axis equal;
%
%   See also: PLYWRITE

% Pascal Getreuer 2004

[fid,Msg] = fopen(Path,'rt'); % open file in read text mode

if fid == -1, error(Msg); end

Buf = fscanf(fid,'%s',1);
if ~strcmp(Buf,'ply')
    fclose(fid);
    error('Not a PLY file.');
end


%%% read header %%%

Position = ftell(fid);
Format = '';
NumComments = 0;
Comments = {}; % for storing any file comments
NumElements = 0;
NumProperties = 0;
Elements = []; % structure for holding the element data
ElementCount = []; % number of each type of element in file
PropertyTypes = []; % corresponding structure recording property types
ElementNames = {}; % list of element names in the order they are stored in the file
PropertyNames = []; % structure of lists of property names

while 1
    Buf = fgetl(fid); % read one line from file
    BufRem = Buf;
    Token = {};
    Count = 0;

    while ~isempty(BufRem) % split line into tokens
        [tmp,BufRem] = strtok(BufRem);

        if ~isempty(tmp)
            Count = Count + 1; % count tokens
            Token{Count} = tmp;
        end
    end

    if Count % parse line
        switch lower(Token{1})
            case 'format' % read data format
                if Count >= 2
                    Format = lower(Token{2});

                    if Count == 3 & ~strcmp(Token{3},'1.0')
                        fclose(fid);
                        error('Only PLY format version 1.0 supported.');
                    end
                end
            case 'comment' % read file comment
                NumComments = NumComments + 1;
                Comments{NumComments} = '';
                for i = 2:Count
                    Comments{NumComments} = [Comments{NumComments},Token{i},' '];
                end
            case 'element' % element name
                if Count >= 3
                    if isfield(Elements,Token{2})
                        fclose(fid);
                        error(['Duplicate element name, ''',Token{2},'''.']);
                    end

                    NumElements = NumElements + 1;
                    NumProperties = 0;
                    Elements = setfield(Elements,Token{2},[]);
                    PropertyTypes = setfield(PropertyTypes,Token{2},[]);
                    ElementNames{NumElements} = Token{2};
                    PropertyNames = setfield(PropertyNames,Token{2},{});
                    CurElement = Token{2};
                    ElementCount(NumElements) = str2double(Token{3});

                    if isnan(ElementCount(NumElements))
                        fclose(fid);
                        error(['Bad element definition: ',Buf]);
                    end
                else
                    error(['Bad element definition: ',Buf]);
                end
            case 'property' % element property
                if ~isempty(CurElement) & Count >= 3
                    NumProperties = NumProperties + 1;
                    eval(['tmp=isfield(Elements.',CurElement,',Token{Count});'],...
                        'fclose(fid);error([''Error reading property: '',Buf])');

                    if tmp
                        error(['Duplicate property name, ''',CurElement,'.',Token{2},'''.']);
                    end

                    % add property subfield to Elements
                    eval(['Elements.',CurElement,'.',Token{Count},'=[];'], ...
                        'fclose(fid);error([''Error reading property: '',Buf])');
                    % add property subfield to PropertyTypes and save type
                    eval(['PropertyTypes.',CurElement,'.',Token{Count},'={Token{2:Count-1}};'], ...
                        'fclose(fid);error([''Error reading property: '',Buf])');
                    % record property name order
                    eval(['PropertyNames.',CurElement,'{NumProperties}=Token{Count};'], ...
                        'fclose(fid);error([''Error reading property: '',Buf])');
                else
                    fclose(fid);

                    if isempty(CurElement)
                        error(['Property definition without element definition: ',Buf]);
                    else
                        error(['Bad property definition: ',Buf]);
                    end
                end
            case 'end_header' % end of header, break from while loop
                break;
        end
    end
end

%%% set reading for specified data format %%%

if isempty(Format)
    warning('Data format unspecified, assuming ASCII.');
    Format = 'ascii';
end

switch Format
    case 'ascii'
        Format = 0;
    case 'binary_little_endian'
        Format = 1;
    case 'binary_big_endian'
        Format = 2;
    otherwise
        fclose(fid);
        error(['Data format ''',Format,''' not supported.']);
end

if ~Format
    Buf = fscanf(fid,'%f'); % read the rest of the file as ASCII data
    BufOff = 1;
else
    % reopen the file in read binary mode
    fclose(fid);

    if Format == 1
        fid = fopen(Path,'r','ieee-le.l64'); % little endian
    else
        fid = fopen(Path,'r','ieee-be.l64'); % big endian
    end

    % find the end of the header again (using ftell on the old handle doesn't give the correct position)
    BufSize = 8192;
    Buf = [blanks(10),char(fread(fid,BufSize,'uchar')')];
    i = [];
    tmp = -11;

    while isempty(i)
        i = findstr(Buf,['end_header',13,10]); % look for end_header + CR/LF
        i = [i,findstr(Buf,['end_header',10])]; % look for end_header + LF

        if isempty(i)
            tmp = tmp + BufSize;
            Buf = [Buf(BufSize+1:BufSize+10),char(fread(fid,BufSize,'uchar')')];
        end
    end

    % seek to just after the line feed
    fseek(fid,i + tmp + 11 + (Buf(i + 10) == 13),-1);
end


%%% read element data %%%

% PLY and MATLAB data types (for fread)
PlyTypeNames = {'char','uchar','short','ushort','int','uint','float','double', ...
    'char8','uchar8','short16','ushort16','int32','uint32','float32','double64'};
MatlabTypeNames = {'schar','uchar','int16','uint16','int32','uint32','single','double'};
SizeOf = [1,1,2,2,4,4,4,8]; % size in bytes of each type

for i = 1:NumElements
    % get current element property information
    eval(['CurPropertyNames=PropertyNames.',ElementNames{i},';']);
    eval(['CurPropertyTypes=PropertyTypes.',ElementNames{i},';']);
    NumProperties = size(CurPropertyNames,2);

    % fprintf('Reading %s...\n',ElementNames{i});

    if ~Format %%% read ASCII data %%%
        for j = 1:NumProperties
            Token = getfield(CurPropertyTypes,CurPropertyNames{j});

            if strcmpi(Token{1},'list')
                Type(j) = 1;
            else
                Type(j) = 0;
            end
        end

        % parse buffer
        if ~any(Type)
            % no list types
            Data = reshape(Buf(BufOff:BufOff+ElementCount(i)*NumProperties-1),NumProperties,ElementCount(i))';
            BufOff = BufOff + ElementCount(i)*NumProperties;
        else
            ListData = cell(NumProperties,1);

            for k = 1:NumProperties
                ListData{k} = cell(ElementCount(i),1);
            end

            % list type
            for j = 1:ElementCount(i)
                for k = 1:NumProperties
                    if ~Type(k)
                        Data(j,k) = Buf(BufOff);
                        BufOff = BufOff + 1;
                    else
                        tmp = Buf(BufOff);
                        ListData{k}{j} = Buf(BufOff+(1:tmp))';
                        BufOff = BufOff + tmp + 1;
                    end
                end
            end
        end
    else %%% read binary data %%%
        % translate PLY data type names to MATLAB data type names
        ListFlag = 0; % = 1 if there is a list type
        SameFlag = 1; % = 1 if all types are the same

        for j = 1:NumProperties
            Token = getfield(CurPropertyTypes,CurPropertyNames{j});

            if ~strcmp(Token{1},'list') % non-list type
                tmp = rem(strmatch(Token{1},PlyTypeNames,'exact')-1,8)+1;

                if ~isempty(tmp)
                    TypeSize(j) = SizeOf(tmp);
                    Type{j} = MatlabTypeNames{tmp};
                    TypeSize2(j) = 0;
                    Type2{j} = '';

                    SameFlag = SameFlag & strcmp(Type{1},Type{j});
                else
                    fclose(fid);
                    error(['Unknown property data type, ''',Token{1},''', in ', ...
                        ElementNames{i},'.',CurPropertyNames{j},'.']);
                end
            else % list type
                if length(Token) == 3
                    ListFlag = 1;
                    SameFlag = 0;
                    tmp = rem(strmatch(Token{2},PlyTypeNames,'exact')-1,8)+1;
                    tmp2 = rem(strmatch(Token{3},PlyTypeNames,'exact')-1,8)+1;

                    if ~isempty(tmp) & ~isempty(tmp2)
                        TypeSize(j) = SizeOf(tmp);
                        Type{j} = MatlabTypeNames{tmp};
                        TypeSize2(j) = SizeOf(tmp2);
                        Type2{j} = MatlabTypeNames{tmp2};
                    else
                        fclose(fid);
                        error(['Unknown property data type, ''list ',Token{2},' ',Token{3},''', in ', ...
                            ElementNames{i},'.',CurPropertyNames{j},'.']);
                    end
                else
                    fclose(fid);
                    error(['Invalid list syntax in ',ElementNames{i},'.',CurPropertyNames{j},'.']);
                end
            end
        end

        % read file
        if ~ListFlag
            if SameFlag
                % no list types, all the same type (fast)
                Data = fread(fid,[NumProperties,ElementCount(i)],Type{1})';
            else
                % no list types, mixed type
                Data = zeros(ElementCount(i),NumProperties);

                for j = 1:ElementCount(i)
                    for k = 1:NumProperties
                        Data(j,k) = fread(fid,1,Type{k});
                    end
                end
            end
        else
            ListData = cell(NumProperties,1);

            for k = 1:NumProperties
                ListData{k} = cell(ElementCount(i),1);
            end

            if NumProperties == 1
                BufSize = 512;
                SkipNum = 4;
                j = 0;

                % list type, one property (fast if lists are usually the same length)
                while j < ElementCount(i)
                    Position = ftell(fid);
                    % read in BufSize count values, assuming all counts = SkipNum
                    [Buf,BufSize] = fread(fid,BufSize,Type{1},SkipNum*TypeSize2(1));
                    Miss = find(Buf ~= SkipNum); % find first count that is not SkipNum
                    fseek(fid,Position + TypeSize(1),-1); % seek back to after first count

                    if isempty(Miss) % all counts are SkipNum
                        Buf = fread(fid,[SkipNum,BufSize],[int2str(SkipNum),'*',Type2{1}],TypeSize(1))';
                        fseek(fid,-TypeSize(1),0); % undo last skip

                        for k = 1:BufSize
                            ListData{1}{j+k} = Buf(k,:);
                        end

                        j = j + BufSize;
                        BufSize = floor(1.5*BufSize);
                    else
                        if Miss(1) > 1 % some counts are SkipNum
                            Buf2 = fread(fid,[SkipNum,Miss(1)-1],[int2str(SkipNum),'*',Type2{1}],TypeSize(1))';

                            for k = 1:Miss(1)-1
                                ListData{1}{j+k} = Buf2(k,:);
                            end

                            j = j + k;
                        end

                        % read in the list with the missed count
                        SkipNum = Buf(Miss(1));
                        j = j + 1;
                        ListData{1}{j} = fread(fid,[1,SkipNum],Type2{1});
                        BufSize = ceil(0.6*BufSize);
                    end
                end
            else
                % list type(s), multiple properties (slow)
                Data = zeros(ElementCount(i),NumProperties);

                for j = 1:ElementCount(i)
                    for k = 1:NumProperties
                        if isempty(Type2{k})
                            Data(j,k) = fread(fid,1,Type{k});
                        else
                            tmp = fread(fid,1,Type{k});
                            ListData{k}{j} = fread(fid,[1,tmp],Type2{k});
                        end
                    end
                end
            end
        end
    end

    % put data into Elements structure
    for k = 1:NumProperties
        if (~Format & ~Type(k)) | (Format & isempty(Type2{k}))
            eval(['Elements.',ElementNames{i},'.',CurPropertyNames{k},'=Data(:,k);']);
        else
            eval(['Elements.',ElementNames{i},'.',CurPropertyNames{k},'=ListData{k};']);
        end
    end
end

clear Data ListData;
fclose(fid);

if (nargin > 1 & strcmpi(Str,'Tri')) | nargout > 2
    % find vertex element field
    Name = {'vertex','Vertex','point','Point','pts','Pts'};
    Names = [];

    for i = 1:length(Name)
        if any(strcmp(ElementNames,Name{i}))
            Names = getfield(PropertyNames,Name{i});
            Name = Name{i};
            break;
        end
    end

    if any(strcmp(Names,'x')) & any(strcmp(Names,'y')) & any(strcmp(Names,'z'))
        eval(['varargout{1}=[Elements.',Name,'.x,Elements.',Name,'.y,Elements.',Name,'.z];']);
    else
        varargout{1} = zeros(1,3);
    end

    varargout{2} = Elements;
    varargout{3} = Comments;
    Elements = [];

    % find face element field
    Name = {'face','Face','poly','Poly','tri','Tri'};
    Names = [];

    for i = 1:length(Name)
        if any(strcmp(ElementNames,Name{i}))
            Names = getfield(PropertyNames,Name{i});
            Name = Name{i};
            break;
        end
    end

    if ~isempty(Names)
        % find vertex indices property subfield
        PropertyName = {'vertex_indices','vertex_indexes','vertex_index','indices','indexes'};

        for i = 1:length(PropertyName)
            if any(strcmp(Names,PropertyName{i}))
                PropertyName = PropertyName{i};
                break;
            end
        end

        if ~iscell(PropertyName)
            % convert face index lists to triangular connectivity
            eval(['FaceIndices=varargout{2}.',Name,'.',PropertyName,';']);
            N = length(FaceIndices);
            Elements = zeros(N*2,3);
            Extra = 0;

            for k = 1:N
                Elements(k,:) = FaceIndices{k}(1:3);

                for j = 4:length(FaceIndices{k})
                    Extra = Extra + 1;
                    Elements(N + Extra,:) = [Elements(k,[1,j-1]),FaceIndices{k}(j)];
                end
            end
            Elements = Elements(1:N+Extra,:) + 1;
        end
    end
else
    varargout{1} = Comments;
end
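plyread.m is bundled so the evaluation runs without extra toolboxes; PointCompareMain only needs the vertex x/y/z arrays. For a Python workflow, the same points can be loaded with the third-party plyfile package, a minimal sketch (assumes pip install plyfile; not part of this repo):

import numpy as np
from plyfile import PlyData

def load_ply_vertices(path):
    # returns a 3xN array matching the Qstl layout used in PointCompareMain.m
    v = PlyData.read(path)['vertex']
    return np.vstack([v['x'], v['y'], v['z']])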
35
IGEV-MVS/evaluations/dtu/reducePts_haa.m
Normal file
@ -0,0 +1,35 @@
function [ptsOut,indexSet] = reducePts_haa(pts, dst)

%Reduces a point set, pts, in a stochastic manner, such that the minimum distance
% between points is 'dst'. Written by abd, edited by haa, then by raje

nPoints=size(pts,2);

indexSet=true(nPoints,1);
RandOrd=randperm(nPoints);

%tic
NS = KDTreeSearcher(pts');
%toc

% search the KD-tree for close neighbours in a chunk-wise fashion to save memory if point cloud is really big
Chunks=1:min(4e6,nPoints-1):nPoints;
Chunks(end)=nPoints;

for cChunk=1:(length(Chunks)-1)
    Range=Chunks(cChunk):Chunks(cChunk+1);

    idx = rangesearch(NS,pts(:,RandOrd(Range))',dst);

    for i = 1:size(idx,1)
        id =RandOrd(i-1+Chunks(cChunk));
        if (indexSet(id))
            indexSet(idx{i}) = 0;
            indexSet(id) = 1;
        end
    end
end

ptsOut = pts(:,indexSet);

disp(['downsample factor: ' num2str(nPoints/sum(indexSet))]);
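The reduction visits points in random order and suppresses every not-yet-kept neighbour within dst, which yields a roughly even subsample without imposing a regular grid. A rough Python equivalent, assuming SciPy (the memory-saving chunking is dropped for clarity; this is an illustrative sketch, not the repo's code):

import numpy as np
from scipy.spatial import cKDTree

def reduce_pts(pts, dst):
    # pts: 3xN array; stochastic downsampling with minimum spacing ~dst
    n = pts.shape[1]
    keep = np.ones(n, dtype=bool)
    tree = cKDTree(pts.T)
    for i in np.random.permutation(n):
        if keep[i]:
            keep[tree.query_ball_point(pts[:, i], dst)] = False  # drop neighbours
            keep[i] = True                                       # but keep the seed
    return pts[:, keep]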
106
IGEV-MVS/lists/blendedmvs/train.txt
Normal file
@ -0,0 +1,106 @@
5c1f33f1d33e1f2e4aa6dda4
5bfe5ae0fe0ea555e6a969ca
5bff3c5cfe0ea555e6bcbf3a
58eaf1513353456af3a1682a
5bfc9d5aec61ca1dd69132a2
5bf18642c50e6f7f8bdbd492
5bf26cbbd43923194854b270
5bf17c0fd439231948355385
5be3ae47f44e235bdbbc9771
5be3a5fb8cfdd56947f6b67c
5bbb6eb2ea1cfa39f1af7e0c
5ba75d79d76ffa2c86cf2f05
5bb7a08aea1cfa39f1a947ab
5b864d850d072a699b32f4ae
5b6eff8b67b396324c5b2672
5b6e716d67b396324c2d77cb
5b69cc0cb44b61786eb959bf
5b62647143840965efc0dbde
5b60fa0c764f146feef84df0
5b558a928bbfb62204e77ba2
5b271079e0878c3816dacca4
5b08286b2775267d5b0634ba
5afacb69ab00705d0cefdd5b
5af28cea59bc705737003253
5af02e904c8216544b4ab5a2
5aa515e613d42d091d29d300
5c34529873a8df509ae57b58
5c34300a73a8df509add216d
5c1af2e2bee9a723c963d019
5c1892f726173c3a09ea9aeb
5c0d13b795da9479e12e2ee9
5c062d84a96e33018ff6f0a6
5bfd0f32ec61ca1dd69dc77b
5bf21799d43923194842c001
5bf3a82cd439231948877aed
5bf03590d4392319481971dc
5beb6e66abd34c35e18e66b9
5be883a4f98cee15019d5b83
5be47bf9b18881428d8fbc1d
5bcf979a6d5f586b95c258cd
5bce7ac9ca24970bce4934b6
5bb8a49aea1cfa39f1aa7f75
5b78e57afc8fcf6781d0c3ba
5b21e18c58e2823a67a10dd8
5b22269758e2823a67a3bd03
5b192eb2170cf166458ff886
5ae2e9c5fe405c5076abc6b2
5adc6bd52430a05ecb2ffb85
5ab8b8e029f5351f7f2ccf59
5abc2506b53b042ead637d86
5ab85f1dac4291329b17cb50
5a969eea91dfc339a9a3ad2c
5a8aa0fab18050187cbe060e
5a7d3db14989e929563eb153
5a69c47d0d5d0a7f3b2e9752
5a618c72784780334bc1972d
5a6464143d809f1d8208c43c
5a588a8193ac3d233f77fbca
5a57542f333d180827dfc132
5a572fd9fc597b0478a81d14
5a563183425d0f5186314855
5a4a38dad38c8a075495b5d2
5a48d4b2c7dab83a7d7b9851
5a489fb1c7dab83a7d7b1070
5a48ba95c7dab83a7d7b44ed
5a3ca9cb270f0e3f14d0eddb
5a3cb4e4270f0e3f14d12f43
5a3f4aba5889373fbbc5d3b5
5a0271884e62597cdee0d0eb
59e864b2a9e91f2c5529325f
599aa591d5b41f366fed0d58
59350ca084b7f26bf5ce6eb8
59338e76772c3e6384afbb15
5c20ca3a0843bc542d94e3e2
5c1dbf200843bc542d8ef8c4
5c1b1500bee9a723c96c3e78
5bea87f4abd34c35e1860ab5
5c2b3ed5e611832e8aed46bf
57f8d9bbe73f6760f10e916a
5bf7d63575c26f32dbf7413b
5be4ab93870d330ff2dce134
5bd43b4ba6b28b1ee86b92dd
5bccd6beca24970bce448134
5bc5f0e896b66a2cd8f9bd36
5b908d3dc6ab78485f3d24a9
5b2c67b5e0878c381608b8d8
5b4933abf2b5f44e95de482a
5b3b353d8d46a939f93524b9
5acf8ca0f3d8a750097e4b15
5ab8713ba3799a1d138bd69a
5aa235f64a17b335eeaf9609
5aa0f9d7a9efce63548c69a1
5a8315f624b8e938486e0bd8
5a48c4e9c7dab83a7d7b5cc7
59ecfd02e225f6492d20fcc9
59f87d0bfa6280566fb38c9a
59f363a8b45be22330016cad
59f70ab1e5c5d366af29bf3e
59e75a2ca9e91f2c5526005d
5947719bf1b45630bd096665
5947b62af1b45630bd0c2a02
59056e6760bb961de55f3501
58f7f7299f5b5647873cb110
58cf4771d0f5fb221defe6da
58d36897f387231e6c929903
58c4bb4f4a69c55606122be4
7
IGEV-MVS/lists/blendedmvs/val.txt
Normal file
@ -0,0 +1,7 @@
5b7a3890fc8fcf6781e2593a
5c189f2326173c3a09ed7ef3
5b950c71608de421b1e7318f
5a6400933d809f1d8200af15
59d2657f82ca7774b1ec081d
5ba19a8a360c7c30c1c169df
59817e4a1bd4b175e7038d19
22
IGEV-MVS/lists/dtu/test.txt
Normal file
@ -0,0 +1,22 @@
scan1
scan4
scan9
scan10
scan11
scan12
scan13
scan15
scan23
scan24
scan29
scan32
scan33
scan34
scan48
scan49
scan62
scan75
scan77
scan110
scan114
scan118
79
IGEV-MVS/lists/dtu/train.txt
Normal file
@ -0,0 +1,79 @@
scan2
scan6
scan7
scan8
scan14
scan16
scan18
scan19
scan20
scan22
scan30
scan31
scan36
scan39
scan41
scan42
scan44
scan45
scan46
scan47
scan50
scan51
scan52
scan53
scan55
scan57
scan58
scan60
scan61
scan63
scan64
scan65
scan68
scan69
scan70
scan71
scan72
scan74
scan76
scan83
scan84
scan85
scan87
scan88
scan89
scan90
scan91
scan92
scan93
scan94
scan95
scan96
scan97
scan98
scan99
scan100
scan101
scan102
scan103
scan104
scan105
scan107
scan108
scan109
scan111
scan112
scan113
scan115
scan116
scan119
scan120
scan121
scan122
scan123
scan124
scan125
scan126
scan127
scan128
18
IGEV-MVS/lists/dtu/val.txt
Normal file
@ -0,0 +1,18 @@
scan3
scan5
scan17
scan21
scan28
scan35
scan37
scan38
scan40
scan43
scan56
scan59
scan66
scan67
scan82
scan86
scan106
scan117
293
IGEV-MVS/train_mvs.py
Normal file
@ -0,0 +1,293 @@
import argparse
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
import random
import time
from torch.utils.tensorboard import SummaryWriter
from datasets import find_dataset_def
from core.igev_mvs import IGEVMVS
from core.submodule import depth_normalization, depth_unnormalization
from utils import *
import sys
import datetime
from tqdm import tqdm

cudnn.benchmark = True


parser = argparse.ArgumentParser(description='IterMVStereo for high-resolution multi-view stereo')
parser.add_argument('--mode', default='train', help='train or val', choices=['train', 'val'])

parser.add_argument('--dataset', default='dtu_yao', help='select dataset')
parser.add_argument('--trainpath', default='/data/dtu_data/dtu_train/', help='train datapath')
parser.add_argument('--valpath', help='validation datapath')
parser.add_argument('--trainlist', default='./lists/dtu/train.txt', help='train list')
parser.add_argument('--vallist', default='./lists/dtu/val.txt', help='validation list')
parser.add_argument('--maxdisp', default=256)

parser.add_argument('--epochs', type=int, default=32, help='number of epochs to train')
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate')
parser.add_argument('--wd', type=float, default=.00001, help='weight decay')

parser.add_argument('--batch_size', type=int, default=6, help='train batch size')
parser.add_argument('--loadckpt', default=None, help='load a specific checkpoint')
parser.add_argument('--logdir', default='./checkpoints/', help='the directory to save checkpoints/logs')
parser.add_argument('--resume', action='store_true', help='continue to train the model')
parser.add_argument('--regress', action='store_true', help='train the regression and confidence')
parser.add_argument('--small_image', action='store_true', help='train with small input as 640x512, otherwise train with 1280x1024')

parser.add_argument('--summary_freq', type=int, default=20, help='print and summary frequency')
parser.add_argument('--save_freq', type=int, default=1, help='save checkpoint frequency')
parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed')
parser.add_argument('--iteration', type=int, default=22, help='num of iteration of GRU')

try:
    from torch.cuda.amp import GradScaler
except ImportError:
    # dummy GradScaler for PyTorch < 1.6; accepts the `enabled` kwarg used below
    class GradScaler:
        def __init__(self, enabled=True):
            pass
        def scale(self, loss):
            return loss
        def unscale_(self, optimizer):
            pass
        def step(self, optimizer):
            optimizer.step()
        def update(self):
            pass


def sequence_loss(disp_preds, disp_init_pred, depth_gt, mask, depth_min, depth_max, loss_gamma=0.9):
    """ Loss function defined over sequence of depth predictions """
    cross_entropy = nn.BCEWithLogitsLoss()
    n_predictions = len(disp_preds)
    assert n_predictions >= 1
    loss = 0.0
    mask = mask > 0.5
    batch, _, height, width = depth_gt.size()
    inverse_depth_min = (1.0 / depth_min).view(batch, 1, 1, 1)
    inverse_depth_max = (1.0 / depth_max).view(batch, 1, 1, 1)

    normalized_disp_gt = depth_normalization(depth_gt, inverse_depth_min, inverse_depth_max)
    loss += 1.0 * F.l1_loss(disp_init_pred[mask], normalized_disp_gt[mask], reduction='mean')

    if args.iteration != 0:
        for i in range(n_predictions):
            adjusted_loss_gamma = loss_gamma**(15/(n_predictions - 1))
            i_weight = adjusted_loss_gamma**(n_predictions - i - 1)
            loss += i_weight * F.l1_loss(disp_preds[i][mask], normalized_disp_gt[mask], reduction='mean')

    return loss
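# A note on the weighting above: adjusted_loss_gamma satisfies
# adjusted_loss_gamma**(n_predictions - 1) == loss_gamma**15, so the
# first-to-last weight ratio stays pinned at 0.9**15 ~= 0.206 no matter how
# many GRU iterations are supervised. A minimal sketch of the resulting
# schedule for the default --iteration 22 (illustrative only, not used by
# the script):
#
#     n = 22
#     g = 0.9 ** (15 / (n - 1))             # ~0.9275
#     weights = [g ** (n - 1 - i) for i in range(n)]
#     # weights[0] ~= 0.206 ... weights[-1] == 1.0 (later iterations dominate)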

# parse arguments and check
args = parser.parse_args()
if args.resume: # store_true means set the variable as "True"
    assert args.mode == "train"
    assert args.loadckpt is None
if args.valpath is None:
    args.valpath = args.trainpath

torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

if args.mode == "train":
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)

    current_time_str = str(datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    print("current time", current_time_str)

    print("creating new summary file")
    logger = SummaryWriter(args.logdir)

print("argv:", sys.argv[1:])
print_args(args)

# dataset, dataloader
MVSDataset = find_dataset_def(args.dataset)

train_dataset = MVSDataset(args.trainpath, args.trainlist, "train", 5, robust_train=True)
test_dataset = MVSDataset(args.valpath, args.vallist, "val", 5, robust_train=False)

TrainImgLoader = DataLoader(train_dataset, args.batch_size, shuffle=True, num_workers=4, drop_last=True)
TestImgLoader = DataLoader(test_dataset, args.batch_size, shuffle=False, num_workers=4, drop_last=False)

# model, optimizer
model = IGEVMVS(args)
if args.mode in ["train", "val"]:
    model = nn.DataParallel(model)
    model.cuda()
model_loss = sequence_loss
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd, eps=1e-8)

# load parameters
start_epoch = 0
if (args.mode == "train" and args.resume) or (args.mode == "val" and not args.loadckpt):
    saved_models = [fn for fn in os.listdir(args.logdir) if fn.endswith(".ckpt")]
    saved_models = sorted(saved_models, key=lambda x: int(x.split('_')[-1].split('.')[0]))
    # use the latest checkpoint file
    loadckpt = os.path.join(args.logdir, saved_models[-1])
    print("resuming", loadckpt)
    state_dict = torch.load(loadckpt)
    model.load_state_dict(state_dict['model'], strict=False)
    optimizer.load_state_dict(state_dict['optimizer'])
    start_epoch = state_dict['epoch'] + 1
elif args.loadckpt:
    # load checkpoint file specified by args.loadckpt
    print("loading model {}".format(args.loadckpt))
    state_dict = torch.load(args.loadckpt)
    model.load_state_dict(state_dict['model'], strict=False)
print("start at epoch {}".format(start_epoch))
print('Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()])))


# main function
def train(args):
    total_steps = len(TrainImgLoader) * args.epochs + 100
    lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, args.lr, total_steps, pct_start=0.01, cycle_momentum=False, anneal_strategy='linear')

    for epoch_idx in range(start_epoch, args.epochs):
        print('Epoch {}:'.format(epoch_idx))
        global_step = len(TrainImgLoader) * epoch_idx

        # training
        tbar = tqdm(TrainImgLoader)
        for batch_idx, sample in enumerate(tbar):
            start_time = time.time()
            global_step = len(TrainImgLoader) * epoch_idx + batch_idx
            do_summary = global_step % args.summary_freq == 0
            scaler = GradScaler(enabled=True)
            loss, scalar_outputs = train_sample(args, sample, detailed_summary=do_summary, scaler=scaler)
            if do_summary:
                save_scalars(logger, 'train', scalar_outputs, global_step)
            del scalar_outputs

            tbar.set_description(
                'Epoch {}/{}, Iter {}/{}, train loss = {:.3f}, time = {:.3f}'.format(epoch_idx, args.epochs, batch_idx, len(TrainImgLoader), loss, time.time() - start_time))

        lr_scheduler.step()

        # checkpoint: also save the epoch index and optimizer state, since the
        # resume path above reads state_dict['optimizer'] and state_dict['epoch']
        if (epoch_idx + 1) % args.save_freq == 0:
            torch.save({
                'epoch': epoch_idx,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()},
                "{}/model_{:0>6}.ckpt".format(args.logdir, epoch_idx))
        torch.cuda.empty_cache()
        # testing
        avg_test_scalars = DictAverageMeter()
        tbar = tqdm(TestImgLoader)
        for batch_idx, sample in enumerate(tbar):
            start_time = time.time()
            global_step = len(TestImgLoader) * epoch_idx + batch_idx
            do_summary = global_step % args.summary_freq == 0
            loss, scalar_outputs = test_sample(args, sample, detailed_summary=do_summary)
            if do_summary:
                save_scalars(logger, 'test', scalar_outputs, global_step)
            avg_test_scalars.update(scalar_outputs)
            del scalar_outputs
            tbar.set_description('Epoch {}/{}, Iter {}/{}, test loss = {:.3f}, time = {:.3f}'.format(epoch_idx, args.epochs, batch_idx,
                                                                                                     len(TestImgLoader), loss,
                                                                                                     time.time() - start_time))
        save_scalars(logger, 'fulltest', avg_test_scalars.mean(), global_step)
        print("avg_test_scalars:", avg_test_scalars.mean())
        torch.cuda.empty_cache()


def test(args):
    avg_test_scalars = DictAverageMeter()
    for batch_idx, sample in enumerate(TestImgLoader):
        start_time = time.time()
        loss, scalar_outputs = test_sample(args, sample, detailed_summary=True)
        avg_test_scalars.update(scalar_outputs)
        del scalar_outputs
        print('Iter {}/{}, test loss = {:.3f}, time = {:.3f}'.format(batch_idx, len(TestImgLoader), loss,
                                                                     time.time() - start_time))
        if batch_idx % 100 == 0:
            print("Iter {}/{}, test results = {}".format(batch_idx, len(TestImgLoader), avg_test_scalars.mean()))
    print("final", avg_test_scalars)


def train_sample(args, sample, detailed_summary=False, scaler=None):
    model.train()
    optimizer.zero_grad()
    sample_cuda = tocuda(sample)
    depth_gt = sample_cuda["depth"]
    mask = sample_cuda["mask"]
    depth_gt_0 = depth_gt['level_0']
    mask_0 = mask['level_0']
    depth_gt_1 = depth_gt['level_2']
    mask_1 = mask['level_2']

    disp_init, disp_predictions = model(sample_cuda["imgs"], sample_cuda["proj_matrices"],
                                        sample_cuda["depth_min"], sample_cuda["depth_max"])

    loss = model_loss(disp_predictions, disp_init, depth_gt_0, mask_0, sample_cuda["depth_min"], sample_cuda["depth_max"])
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()

    inverse_depth_min = (1.0 / sample_cuda["depth_min"]).view(args.batch_size, 1, 1, 1)
    inverse_depth_max = (1.0 / sample_cuda["depth_max"]).view(args.batch_size, 1, 1, 1)
    depth_init = depth_unnormalization(disp_init, inverse_depth_min, inverse_depth_max)

    depth_predictions = []
    for disp in disp_predictions:
        depth_predictions.append(depth_unnormalization(disp, inverse_depth_min, inverse_depth_max))

    scalar_outputs = {"loss": loss}
    scalar_outputs["abs_error_initial"] = AbsDepthError_metrics(depth_init, depth_gt_0, mask_0 > 0.5)
    scalar_outputs["thres1mm_initial"] = Thres_metrics(depth_init, depth_gt_0, mask_0 > 0.5, 1)
    scalar_outputs["abs_error_final_full"] = AbsDepthError_metrics(depth_predictions[-1], depth_gt_0, mask_0 > 0.5)
    return tensor2float(loss), tensor2float(scalar_outputs)


@make_nograd_func
def test_sample(args, sample, detailed_summary=True):
    model.eval()
    sample_cuda = tocuda(sample)
    depth_gt = sample_cuda["depth"]
    mask = sample_cuda["mask"]
    depth_gt_0 = depth_gt['level_0']
    mask_0 = mask['level_0']
    depth_gt_1 = depth_gt['level_2']
    mask_1 = mask['level_2']

    disp_init, disp_predictions = model(sample_cuda["imgs"], sample_cuda["proj_matrices"],
                                        sample_cuda["depth_min"], sample_cuda["depth_max"])

    loss = model_loss(disp_predictions, disp_init, depth_gt_0, mask_0, sample_cuda["depth_min"], sample_cuda["depth_max"])

    inverse_depth_min = (1.0 / sample_cuda["depth_min"]).view(sample_cuda["depth_min"].size()[0], 1, 1, 1)
    inverse_depth_max = (1.0 / sample_cuda["depth_max"]).view(sample_cuda["depth_max"].size()[0], 1, 1, 1)
    depth_init = depth_unnormalization(disp_init, inverse_depth_min, inverse_depth_max)

    depth_predictions = []
    for disp in disp_predictions:
        depth_predictions.append(depth_unnormalization(disp, inverse_depth_min, inverse_depth_max))

    scalar_outputs = {"loss": loss}
    scalar_outputs["abs_error_initial"] = AbsDepthError_metrics(depth_init, depth_gt_0, mask_0 > 0.5)
    scalar_outputs["thres1mm_initial"] = Thres_metrics(depth_init, depth_gt_0, mask_0 > 0.5, 1)
    scalar_outputs["abs_error_final_full"] = AbsDepthError_metrics(depth_predictions[-1], depth_gt_0, mask_0 > 0.5)
    return tensor2float(loss), tensor2float(scalar_outputs)


if __name__ == '__main__':
    if args.mode == "train":
        train(args)
    elif args.mode == "val":
        test(args)
155
IGEV-MVS/utils.py
Normal file
@ -0,0 +1,155 @@
import numpy as np
import torchvision.utils as vutils
import torch
import torch.nn.functional as F


# print arguments
def print_args(args):
    print("################################ args ################################")
    for k, v in args.__dict__.items():
        print("{0: <10}\t{1: <30}\t{2: <20}".format(k, str(v), str(type(v))))
    print("########################################################################")


# torch.no_grad wrapper for functions
def make_nograd_func(func):
    def wrapper(*f_args, **f_kwargs):
        with torch.no_grad():
            ret = func(*f_args, **f_kwargs)
        return ret

    return wrapper


# convert a function into recursive style to handle nested dict/list/tuple variables
def make_recursive_func(func):
    def wrapper(vars):
        if isinstance(vars, list):
            return [wrapper(x) for x in vars]
        elif isinstance(vars, tuple):
            return tuple([wrapper(x) for x in vars])
        elif isinstance(vars, dict):
            return {k: wrapper(v) for k, v in vars.items()}
        else:
            return func(vars)

    return wrapper


@make_recursive_func
def tensor2float(vars):
    if isinstance(vars, float):
        return vars
    elif isinstance(vars, torch.Tensor):
        return vars.data.item()
    else:
        raise NotImplementedError("invalid input type {} for tensor2float".format(type(vars)))


@make_recursive_func
def tensor2numpy(vars):
    if isinstance(vars, np.ndarray):
        return vars
    elif isinstance(vars, torch.Tensor):
        return vars.detach().cpu().numpy().copy()
    else:
        raise NotImplementedError("invalid input type {} for tensor2numpy".format(type(vars)))


@make_recursive_func
def tocuda(vars):
    if isinstance(vars, torch.Tensor):
        return vars.cuda()
    elif isinstance(vars, str):
        return vars
    else:
        raise NotImplementedError("invalid input type {} for tocuda".format(type(vars)))


def save_scalars(logger, mode, scalar_dict, global_step):
    scalar_dict = tensor2float(scalar_dict)
    for key, value in scalar_dict.items():
        if not isinstance(value, (list, tuple)):
            name = '{}/{}'.format(mode, key)
            logger.add_scalar(name, value, global_step)
        else:
            for idx in range(len(value)):
                name = '{}/{}_{}'.format(mode, key, idx)
                logger.add_scalar(name, value[idx], global_step)


def save_images(logger, mode, images_dict, global_step):
    images_dict = tensor2numpy(images_dict)

    def preprocess(name, img):
        if not (len(img.shape) == 3 or len(img.shape) == 4):
            raise NotImplementedError("invalid img shape {}:{} in save_images".format(name, img.shape))
        if len(img.shape) == 3:
            img = img[:, np.newaxis, :, :]
        img = torch.from_numpy(img[:1])
        return vutils.make_grid(img, padding=0, nrow=1, normalize=True, scale_each=True)

    for key, value in images_dict.items():
        if not isinstance(value, (list, tuple)):
            name = '{}/{}'.format(mode, key)
            logger.add_image(name, preprocess(name, value), global_step)
        else:
            for idx in range(len(value)):
                name = '{}/{}_{}'.format(mode, key, idx)
                logger.add_image(name, preprocess(name, value[idx]), global_step)


class DictAverageMeter(object):
    def __init__(self):
        self.data = {}
        self.count = 0

    def update(self, new_input):
        self.count += 1
        if len(self.data) == 0:
            for k, v in new_input.items():
                if not isinstance(v, float):
                    raise NotImplementedError("invalid data {}: {}".format(k, type(v)))
                self.data[k] = v
        else:
            for k, v in new_input.items():
                if not isinstance(v, float):
                    raise NotImplementedError("invalid data {}: {}".format(k, type(v)))
                self.data[k] += v

    def mean(self):
        return {k: v / self.count for k, v in self.data.items()}


# a wrapper to compute metrics for each image individually
def compute_metrics_for_each_image(metric_func):
    def wrapper(depth_est, depth_gt, mask, *args):
        batch_size = depth_gt.shape[0]
        results = []
        # compute result one by one
        for idx in range(batch_size):
            ret = metric_func(depth_est[idx], depth_gt[idx], mask[idx], *args)
            results.append(ret)
        return torch.stack(results).mean()

    return wrapper


@make_nograd_func
@compute_metrics_for_each_image
def Thres_metrics(depth_est, depth_gt, mask, thres):
    # if thres is int or float, then True
    assert isinstance(thres, (int, float))
    depth_est, depth_gt = depth_est[mask], depth_gt[mask]
    errors = torch.abs(depth_est - depth_gt)
    err_mask = errors > thres
    return torch.mean(err_mask.float())
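# Note: Thres_metrics averages (errors > thres), so it returns the fraction of
# masked pixels whose absolute depth error EXCEEDS the threshold (an outlier
# ratio, lower is better); the "thres1mm_initial" scalar logged in
# train_mvs.py should be read accordingly.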

# NOTE: please do not use this to build up training loss
@make_nograd_func
@compute_metrics_for_each_image
def AbsDepthError_metrics(depth_est, depth_gt, mask):
    depth_est, depth_gt = depth_est[mask], depth_gt[mask]
    return torch.mean((depth_est - depth_gt).abs())