<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners, these should be filled in appropriately as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta name="description" content="SimView">
<meta property="og:title" content="SimView"/>
<meta property="og:description" content="Project page for Object Instance Retrieval in Assistive Robotics: Leveraging Fine-Tuned SimSiam with Multi-View Images Based on 3D Semantic Map."/>
<meta property="og:url" content="https://emergentsystemlabstudent.github.io/MultiViewRetrieve/"/>
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content="static/image/your_banner_image.png" />
<meta property="og:image:width" content="1200"/>
<meta property="og:image:height" content="630"/>
<meta name="twitter:title" content="Project page for Object Instance Retrieval in Assistive Robotics: Leveraging Fine-Tuned SimSiam with Multi-View Images Based on 3D Semantic Map.">
<meta name="twitter:description" content="Project page for Object Instance Retrieval in Assistive Robotics: Leveraging Fine-Tuned SimSiam with Multi-View Images Based on 3D Semantic Map.">
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:image" content="static/images/your_twitter_banner_image.png">
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by-->
<meta name="keywords" content="KEYWORDS SHOULD BE PLACED HERE">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>SimView</title>
<link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script src="static/js/index.js"></script>
<script src="static/js/carousel.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">Object Instance Retrieval in Assistive Robotics: Leveraging Fine-Tuned SimSiam with Multi-View Images Based on 3D Semantic Map</h1>
<div class="is-size-5 publication-authors">
<!-- Paper authors -->
<span class="author-block">
Taichi Sakaguchi<sup>1</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.co.jp/citations?user=jtB7J0AAAAAJ&hl=ja&oi=ao" target="_blank">Akira Taniguchi<sup>1,*</sup>,</a></span>
<span class="author-block">
<a href="https://scholar.google.co.jp/citations?user=Y4qjYvMAAAAJ&hl=ja&oi=ao" target="_blank">Yoshinobu Hagiwara<sup>1,2</sup>,</a></span>
<span class="author-block">
<a href="https://scholar.google.co.jp/citations?user=tsm7qaQAAAAJ&hl=en&oi=ao" target="_blank">Lotfi El Hafi<sup>1</sup>,</span>
<br>
<span class="author-block">
<a href="https://scholar.google.co.jp/citations?user=KPxSCJUAAAAJ&hl=ja&oi=ao" target="_blank">Shoichi Hasegawa<sup>1</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.co.jp/citations?user=dPOCLQEAAAAJ&hl=ja&oi=ao" target="_blank">Tadahiro Taniguchi<sup>1,3</sup></a></span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block">
<sup>1</sup>Ritsumeikan University,
<sup>2</sup>Soka University,
<sup>3</sup>Kyoto University
<br><span class="publication-awards">Accepted at IEEE/RSJ IROS 2024</span>
</span>
<span class="eql-cntrb"><small><br><sup>*</sup>Corresponding Author</small></span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- Arxiv PDF link -->
<span class="link-block">
<a href="https://arxiv.org/abs/2404.09647" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>
Paper
</span>
</a>
</span>
<!-- Slide link -->
<span class="link-block">
<a href="" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Slide (coming soon)</span>
</a>
</span>
<!-- Github link -->
<span class="link-block">
<a href="https://github.com/EmergentSystemLabStudent/MultiViewRetrieve" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<!-- ArXiv abstract Link -->
<!-- <span class="link-block">
<a href="https://arxiv.org/abs/<ARXIV PAPER ID>" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span> -->
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Teaser video-->
<section class="hero teaser">
<div class="container is-max-desktop" style="text-align: center;"> <!-- 動画を中央揃え -->
<div class="hero-body">
<video poster="" id="tree" autoplay controls muted loop
style="width: 80%; height: auto; display: block; margin: 0 auto;">
<source src="./static/videos/IROS24_2251_VI_i.mp4" type="video/mp4">
</video>
</div>
</div>
</section>
<!-- End teaser video -->
<!-- Paper abstract -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Robots that assist in daily life are required to locate specific instances of objects that match the user's desired object in the environment.
This task is known as Instance-Specific Image Goal Navigation (InstanceImageNav), which requires a model capable of distinguishing between different instances within the same class.
One significant challenge in robotics is that when a robot observes the same object from various 3D viewpoints, its appearance may differ greatly, making it difficult to recognize and locate the object accurately.
In this study, we introduce a method, <b>SimView</b>, that leverages multi-view images based on a 3D semantic map of the environment and self-supervised learning by SimSiam to train an instance identification model on-site.
The effectiveness of our approach is validated using a photorealistic simulator, Habitat Matterport 3D, created by scanning real home environments.
Our results demonstrate a 1.7-fold improvement in task accuracy over CLIP, a model pre-trained with multimodal contrastive learning, when applied to object search.
This improvement highlights the benefits of our proposed fine-tuning method in enhancing the performance of assistive robots in InstanceImageNav tasks.</p>
</div>
</div>
</div>
</div>
</section>
<!-- End paper abstract -->
<!-- Image carousel -->
<section class="hero is-small">
<div class="hero-body">
<div class="container">
<h2 class="title is-3">Overview</h2>
<!-- <div id="results-carousel" class="carousel results-carousel"> -->
<div class="item" style="text-align: center;">
<img src="./static/images/overview.jpg" alt="image of model" style="width: 70%; height: auto; display: block; margin: 0 auto;"/>
<h2 class="subtitle" style="text-align: left; margin-top: 20px;">
The task addressed in this study.
(Top) The robot identifies the position of an object shown in a query image provided by the user's mobile phone.
(Bottom left) Domain gap: the image quality differs significantly between the image taken by the user's mobile phone and the object image observed by the real robot.
(Bottom right) Contrastive learning aligns images of the same instance with different image quality in the latent space.
</h2>
</div>
<!-- <div class="item">
<img src="static/images/carousel2.jpg" alt="MY ALT TEXT"/>
<h2 class="subtitle has-text-centered">
Second image description.
</h2>
</div>
<div class="item">
<img src="static/images/carousel3.jpg" alt="MY ALT TEXT"/>
<h2 class="subtitle has-text-centered">
Third image description.
</h2>
</div>
<div class="item">
<img src="static/images/carousel4.jpg" alt="MY ALT TEXT"/>
<h2 class="subtitle has-text-centered">
Fourth image description.
</h2>
</div> -->
<!-- </div> -->
</div>
</div>
</section>
<!-- End image carousel -->
<!-- Image carousel -->
<section class="hero is-small">
<div class="hero-body">
<div class="container">
<h2 class="title is-3">SimView</h2>
<!-- <div id="results-carousel" class="carousel results-carousel"> -->
<div class="item" style="text-align: center;">
<img src="./static/images/proposed_method.svg" alt="image of model" style="width: 70%; height: auto; display: block; margin: 0 auto;"/>
<h2 class="subtitle" style="text-align: left; margin-top: 20px;">
In the proposed system, a robot explores the environment, identifies the instance identical to a given query image from among the collected object images, and uses a 3D semantic map of the environment to locate the target object's position.
In addition, we propose a method, <b>Semantic Instance Multi-view Contrastive Fine-tuning (SimView)</b>, for fine-tuning pre-trained models using a self-supervised learning framework to improve task accuracy in the environment.
The figure above shows a diagram of our proposed system; a minimal sketch of the retrieval step is given below.
</h2>
</div>
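<div class="content has-text-justified" style="margin-top: 20px;">
<p>
The following is a minimal, hypothetical Python sketch of the retrieval step described above, not the released implementation. The <code>encoder</code>, the collected <code>object_images</code>, their <code>instance_ids</code>, and the <code>semantic_map</code> lookup are illustrative assumptions about how the pieces fit together.
</p>
<pre><code>
# Hypothetical sketch of the retrieval step (not the released SimView code).
# The fine-tuned encoder embeds the query image and the object images collected
# by the robot; the best-matching instance is then looked up in the 3D semantic map.
import torch
import torch.nn.functional as F

def retrieve_position(encoder, query_image, object_images, instance_ids, semantic_map):
    """query_image: (C, H, W) tensor; object_images: (N, C, H, W) tensor of collected crops;
    instance_ids: list of N pseudo-labels; semantic_map: dict mapping instance ID to 3D position."""
    with torch.no_grad():
        q = F.normalize(encoder(query_image.unsqueeze(0)), dim=-1)   # (1, D) query embedding
        feats = F.normalize(encoder(object_images), dim=-1)          # (N, D) object embeddings
    sims = (feats @ q.T).squeeze(1)                                  # cosine similarity to the query
    best = sims.argmax().item()                                      # most similar observed object
    return semantic_map[instance_ids[best]]                          # its position from the 3D semantic map
</code></pre>
</div>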
<!-- <div class="item">
<img src="static/images/carousel2.jpg" alt="MY ALT TEXT"/>
<h2 class="subtitle has-text-centered">
Second image description.
</h2>
</div>
<div class="item">
<img src="static/images/carousel3.jpg" alt="MY ALT TEXT"/>
<h2 class="subtitle has-text-centered">
Third image description.
</h2>
</div>
<div class="item">
<img src="static/images/carousel4.jpg" alt="MY ALT TEXT"/>
<h2 class="subtitle has-text-centered">
Fourth image description.
</h2>
</div> -->
<!-- </div> -->
</div>
</div>
</section>
<!-- End image carousel -->
<!-- Image carousel -->
<section class="hero is-small">
<div class="hero-body">
<div class="container">
<h2 class="title is-3">Self-Supervised Fine-tuning Module</h2>
<div class="item" style="text-align: center;"> <!-- 画像を中央揃え -->
<img src="./static/images/fine_tuning.svg" alt="image of model"
style="width: 70%; height: auto; display: block; margin: 0 auto;"/>
<h2 class="subtitle" style="text-align: left; margin-top: 20px;">
This module fine-tunes the image encoder pre-trained by contrastive learning, using self-supervised learning, the object images observed by the robot while exploring the environment, and their pseudo-labels.
Because the robot observes objects while moving through the environment, images of the same instance are captured from various viewing angles.
In a preliminary experiment, we confirmed that fine-tuning a pre-trained model with contrastive learning alone on such a dataset makes instance discrimination accuracy worse than that of the original pre-trained model.
Therefore, we propose training a linear classifier simultaneously with contrastive learning.
We use the object instance IDs <i>y</i><sub>true</sub>, obtained from a 3D semantic map of the robot's environment, as pseudo-labels.
In addition, contrastive learning methods that rely on negative pairs typically require very large batch sizes and large amounts of training data, so fine-tuning with them would force the robot to keep exploring the environment for a long time to collect object images.
Therefore, we use SimSiam for fine-tuning, which allows learning even with a small batch size.
A minimal sketch of the resulting joint objective is given below.
</div>
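<div class="content has-text-justified" style="margin-top: 20px;">
<p>
The following is a minimal, hypothetical PyTorch sketch of the joint objective described above: a SimSiam negative-cosine loss on two augmented views plus a cross-entropy loss on the pseudo-labels. Module names such as <code>encoder</code>, <code>projector</code>, <code>predictor</code>, and <code>classifier</code>, and the weight <code>lam</code>, are illustrative assumptions rather than the released implementation.
</p>
<pre><code>
# Hypothetical sketch of the joint SimSiam + linear-classifier fine-tuning objective.
import torch
import torch.nn.functional as F

def simsiam_loss(p, z):
    # Negative cosine similarity with stop-gradient on z, as in SimSiam.
    return -F.cosine_similarity(p, z.detach(), dim=-1).mean()

def training_step(encoder, projector, predictor, classifier, x1, x2, y_true, lam=1.0):
    # x1, x2: two augmentations of the same object image observed by the robot.
    # y_true: pseudo instance ID taken from the 3D semantic map.
    h1, h2 = encoder(x1), encoder(x2)
    z1, z2 = projector(h1), projector(h2)
    p1, p2 = predictor(z1), predictor(z2)
    loss_ssl = 0.5 * (simsiam_loss(p1, z2) + simsiam_loss(p2, z1))   # self-supervised term
    loss_cls = F.cross_entropy(classifier(h1), y_true)               # pseudo-label classification term
    return loss_ssl + lam * loss_cls                                  # joint objective
</code></pre>
</div>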
</div>
</div>
</section>
<!-- End image carousel -->
<!-- Paper poster -->
<section class="hero is-small is-light">
<div class="hero-body">
<div class="container">
<h2 class="title">Poster</h2>
<iframe src="./static/pdf/IROS_Poster.pdf" width="100%" height="550">
</iframe>
</div>
</div>
</section>
<!--End paper poster -->
<!--BibTex citation -->
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<!-- <pre><code>BibTex Code Here</code></pre> -->
<pre><code>
@inproceedings{sakaguchi2024simview,
author={Sakaguchi, Taichi and Taniguchi, Akira and Hagiwara, Yoshinobu and El Hafi, Lotfi and Hasegawa, Shoichi and Taniguchi, Tadahiro},
title={Object Instance Retrieval in Assistive Robotics: Leveraging Fine-Tuned SimSiam with Multi-View Images Based on 3D Semantic Map},
booktitle={IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
year={2024, in press}
}
</code></pre>
</div>
</section>
<!--End BibTex citation -->
<!--Related Research -->
<section class="section" id="Related Research">
<div class="container is-max-desktop content">
<h2 class="title">Related Research</h2>
<ul>
<li><a href="https://emergentsystemlabstudent.github.io/DomainBridgingNav/" target="_blank" rel="noopener noreferrer">CrossIA</a></li>
<li><a href="https://tomochika-ishikawa.github.io/Active-SpCoSLAM/" target="_blank" rel="noopener noreferrer">Active SpCoSLAM</a></li>
<li><a href="https://emergentsystemlabstudent.github.io/ECRAP/" target="_blank" rel="noopener noreferrer">ECRAP</a></li>
<li><a href="https://emergentsystemlabstudent.github.io/PointingImgEst/" target="_blank" rel="noopener noreferrer">PointingImgEst</a></li>
</ul>
</div>
</section>
<!--End Related Research -->
<!--Laboratory Information -->
<section class="section" id="Laboratory Information">
<div class="container is-max-desktop content">
<h2 class="title">Laboratory Information</h2>
<ul>
<li><a href="http://www.em.ci.ritsumei.ac.jp/" target="_blank" rel="noopener noreferrer">Emergent Systems Laboratory</a></li>
<li><a href="https://www.youtube.com/@tarosouhatsu494/videos" target="_blank" rel="noopener noreferrer">Demonstration Videos of the Laboratory</a></li>
</ul>
</div>
</section>
<!--End Laboratory Information -->
<!--Acknowledgements citation -->
<section class="section" id="Acknowledgements">
<div class="container is-max-desktop content">
<!-- <h2 class="title">Acknowledgements</h2> -->
<h2 class="title">Funding</h2>
<p>
This work was supported by JSPS KAKENHI Grants-in-Aid for Scientific Research (Grant Numbers JP23K16975, 22K12212) and JST Moonshot Research &amp; Development Program (Grant Number JPMJMS2011).
</p>
</div>
</section>
<!--End Acknowledgements citation -->
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
You are free to borrow the source code of this website; we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
<!-- Statcounter tracking code -->
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->
<!-- End of Statcounter Code -->
</body>
</html>