import techimg_01 from 'assets/imgs/techblog_img_01.jpg';

import tech0fig1 from 'assets/imgs/tech-0-fig1.png';
import tech0fig2 from 'assets/imgs/tech-0-fig2.png';
import tech0fig3 from 'assets/imgs/tech-0-fig3.png';
import tech0fig4 from 'assets/imgs/tech-0-fig4.png';
import tech0fig5 from 'assets/imgs/tech-0-fig5.png';
import tech0fig6 from 'assets/imgs/tech-0-fig6.png';
import tech0fig7 from 'assets/imgs/tech-0-fig7.png';

import Tech1Thumb from 'assets/imgs/tech1-thumb-0.png';
import tech1fig1 from 'assets/imgs/tech-1-fig0.png';
import tech1fig2 from 'assets/imgs/tech-1-fig1.png';
import tech1fig3 from 'assets/imgs/tech-1-fig2.png';
import tech1fig4 from 'assets/imgs/tech-1-fig3.png';
import tech1fig5 from 'assets/imgs/tech-1-fig4.png';
import tech1fig6 from 'assets/imgs/tech-1-fig5.png';

import techblog2Thumb from 'assets/imgs/techblog3-main.jpg';
import techblog2fig1 from 'assets/imgs/techblog3-fig1.png';
import techblog2fig2 from 'assets/imgs/techblog3-fig2.png';
// import techblog2fig3 from 'assets/imgs/techblog3-table3.png';
import techblog2table1 from 'assets/imgs/techblog3-fig3-1.png';
import techblog2table2 from 'assets/imgs/techblog3-fig3-2.png';
import techblog2table3 from 'assets/imgs/techblog3-fig3-3.png';
import techblog2fig4 from 'assets/imgs/techblog3-fig4.png';
import techblog2fig5 from 'assets/imgs/techblog3-fig5.png';

import techblog3Thumb from 'assets/imgs/techblog4thumb.png';
import techblog3fig1 from 'assets/imgs/techblog4fig1.png';
import techblog3table1 from 'assets/imgs/techblog4table1.png';
import techblog3fig2 from 'assets/imgs/techblog4fig2.png';

import JongHwa from 'assets/imgs/jonghwa.png';
import SeHun from 'assets/imgs/img-profile-tech-sehun2x.png';
import SeokKyu from 'assets/imgs/img-profile-tech-seokkyu2x.png';
import JaeYoung from 'assets/imgs/jaeyoung.png';
import { addComma } from 'utils';

export const techBlog = [
  {
    key: 0,
    thumbnail: Tech1Thumb,
    title: `(Paper Review) Lightweight and Effective Facial Landmark Detection using Adversarial Learning with Face Geometric Map Generative Network`,
    subtitle: [
      '',
      // `(Paper Review) Lightweight and Effective Facial Landmark Detection using Adversarial Learning with Face Geometric Map Generative Network`,
    ],
    subscription: 'Face Detection & Facial Landmark Detection',
    time: 'Jan 21, 2021',
    author_thumbnail: SeokKyu,
    author: 'Seok Kyu Choi',
    position: 'AI Scientist',
    originalLink: 'https://blog.genesislab.ai/?p=12611',
    category: ['Detection'],
    description: [
      {
        type: 'text',
        content:
          'This paper was published by KAIST IVY Lab. and Genesis Lab from the IITP(Institute of Information & Communications Technology Planning & Evaluation) 2018 ICT R&D Voucher project.',
      },
      { type: 'title', content: 'Original Paper' },
      {
        type: 'link',
        content:
          'Lightweight and Effective Facial Landmark Detection using Adversarial Learning with Face Geometric Map Generative Network',
        link: 'https://ieeexplore.ieee.org/document/8633862',
      },

      { type: 'title', content: 'Facial Landmark' },
      {
        type: 'text',
        content:
          'Facial Landmarks are features that represent key elements that make up a face (eyes, eyebrows, nose, mouth, and jawline). There are usually 68 feature points and they are used to find faces inside of an image. This is why Facial Landmark Detection is very important when trying to find faces elaborately. The facial landmarks detected through the algorithm are used not only to detect faces but also in various fields of computer vision, such as head posture estimation and emotion recognition.',
      },
      {
        type: 'img',
        content: tech1fig1,
      },
      {
        type: 'title',
        content: 'Introduction',
      },
      {
        type: 'text',
        content:
          'Facial Landmark Detection is a task of localizing facial key components which provide essential information for computer vision task. In general, there are two FLD methods. Optimization-based methods, which predicts directional movement to fit the facial model to the given face image, and regression-based methods, where it directly predicts the position of a landmark point using learned parameters. Recently, deep learning-based methods have shown better performances. There have been numerous new studies on different methods, as deep learning-based research grows in popularity, such as the multi-task learning method which tries to simultaneously solve tasks such as face detection and head pose estimation. The facial landmark detection research must be applicable to mobile or web applications, so the model needs to be simple as well as accurate. However, when using a simple CNN structure, the performance from images with misaligned facial contour is not that satisfying. A study using two sub-networks that predicts the inner components and the contour of the face each, alleviated the problem — but still hard to say that it is solved.',
      },
      {
        type: 'text',
        content:
          'This paper proposes a Geometric Prior-Generative Adversarial Network based on GAN that learns through adversarial mini-max game between generator and discriminator. The proposed model uses adversarial and face geometric loss to train, unlike conventional methods that just use L1 or L2 loss to learn the difference between ground truth facial landmarks and predicted landmarks. In the paper, a generator is trained to predict the facial inner geometric map and facial contour geometric map, through the output value of the trained Encoder, which is trained to predict the coordinates of the face landmark from the face image. In addition, the Discriminators are designed to learn to distinguish between ground truth facial landmarks and a predicted facial landmarks by a generating model.',
      },

      {
        type: 'title',
        content: 'Face Geometry Generative Adversarial Network',
      },
      {
        type: 'title',
        content: 'Model Overview',
      },
      { type: 'img', content: tech1fig2 },
      {
        type: 'fig-desc',
        content: `Fig. 2. The overview of proposed face geometry GAN for facial landmark detection. (a) shows the facial landmark estimator, (b) shows the facial inner geometric map generator, (c) shows facial contour geometric map generator, (d) shows facial inner geometric map discriminator, and (e) shows facial contour geometric map discriminator. In generator, two adversarial geometric maps (inner and contour) are generated. The generator and the discriminators are trained through adversarial mini-max game. The estimator predicts facial inner/contour landmarks. Then the discriminators determine whether the geometric maps
          are real/fake and predict each facial landmark. Note that binary maps are deployed as face geometric maps as shown in figure.`,
      },
      {
        type: 'title',
        content: 'Training Face Geometric Map Generator',
      },
      {
        type: 'text',
        content: `The generator is composed of one encoder that predicts facial inner and contour landmarks from an input image, and two decoders that generate a geometric map from the output values of the encoder. The geometric face features are vital to accurately predict landmarks for images containing various noise, such as cropped or angled faces. The previous method, the L1 and L2 method that only considers the difference between the actual and predicted values, does not take these features into account. However, in this paper, the encoder in the generator predicts the inner face and contour landmarks respectively, and the decoder in the generator utilizes them to create a geometric map. When training the generator, the adversarial loss function and the prediction loss function of the discriminator are also taken into account, but the discriminator’s parameters are fixed.`,
      },
      { type: 'img', content: tech1fig3 },
      {
        type: 'fig-desc',
        content: `Fig. 3. (a) shows the dice coefficient used for facial geometry evaluation. (b)
        shows facial geometry match and dice coefficients evaluating generated facial
        geometric maps for given ground truth facial geometric maps.`,
      },
      {
        type: 'title',
        content: 'Training Discriminator',
      },
      {
        type: 'text',
        content: `Each discriminator is trained to determine whether the inputted geometric map is real or generated, and to predict facial landmarks as well. The loss function that is calculated during the training of generators is included among the discriminator loss functions. The generator is trained to generate more realistic geometric maps, since the parameters of the discriminant function are not updated during generator training, in order to minimize loss. Similarly, the generator parameters are fixed during the discriminator training.`,
      },
      {
        type: 'title',
        content: 'Experiment Result',
      },

      {
        type: 'data',
        content: [
          ['Name', 'Train', 'Test', 'Augmentation Data'],
          ['HELEN DATASET', addComma(2000), addComma(330), addComma(24000)],
          ['300-W DATASET', addComma(3148), addComma(689), addComma(40082)],
        ],
      },
      {
        type: 'fig-desc',
        content:
          'The datasets used in the experiment are HELEN and 300-W. HELEN has two types of annotations: one is 194 landmarks and the other is 68 landmarks. 300-W consists of 4 subsets ( AFW,LFPW,HELEN,IBUG), 3,148 training data (AFW:377+,HELEN:2,000+LFPW:811), 689 test data(LFPW:224+,HELEN:300+,IBUG:135). Also, data augmentation such as translation, rotation, and magnifications were conducted.',
      },
      {
        type: 'title',
        content: 'Experiments for Performance Comparison',
      },
      { type: 'img', content: tech1fig4 },
      {
        type: 'fig-desc',
        content:
          'If you look at the table above, you can see that it performs better than the existing methods. It performs better than TCDCN which was pretrained through MAFL Database, and RCFA that uses RNN for face alignment.',
      },
      { type: 'img', content: tech1fig5 },
      {
        type: 'fig-desc',
        content:
          'In the table above, the model proposed by this paper shows the best performance even in the 300-W, which contains extremely challenging images, so it could be considered to be more robust than other models.',
      },

      {
        type: 'title',
        content: 'Experiment Results for Usefulness of Contour Map',
      },
      {
        type: 'text',
        content:
          'In this paper, they verified the effectiveness of contour geometric maps through experiments.',
      },
      { type: 'img', content: tech1fig6 },
      {
        type: 'fig-desc',
        content:
          'The graph above is an experiment with three models, which have the same structure but different training methods. CNN8 is a model optimized with L1 and Contour Geometric,8 is a model trained without Facial Inners considered. As the experiment result above shows, the method using facial contour performs better than the one using the general L1 loss method, and this is further improved when facial inners are used.',
      },
      {
        type: 'title',
        content: 'Conclusion',
      },
      {
        type: 'text',
        content:
          'This paper presented that adversarial learning using geometric facial information is better than existing methods in FLD. The facial contour geometric map helps inner facial landmark points to be localized within the correct facial contour region. During the test stage, the landmarks could be extracted using the encoder only, so it has achieved the goal of being a simple yet effective FLD network that can be applied to various applications.',
      },
      {
        type: 'title',
        content: 'References',
      },
      {
        type: 'text',
        content: `[2] A. Asthana, S. Zafeiriou, S. Cheng, and M. Pantic, “Robust discriminative response map fitting with constrained local models,” in Proceedings
            of the IEEE conference on computer vision and pattern recognition,
            2013, pp. 3444–3451.`,
      },
      {
        type: 'text',
        content: `[5] Z. Zhang, P. Luo, C. C. Loy, and X. Tang, “Learning deep representation
          for face alignment with auxiliary attributes,” IEEE transactions on
          pattern analysis and machine intelligence, vol. 38, no. 5, pp. 918–930,
          2016.`,
      },
      {
        type: 'text',
        content: `[6] J. Lv, X. Shao, J. Xing, C. Cheng, X. Zhou et al., “A deep regression
          architecture with two-stage re-initialization for high performance facial
          landmark detection,” in The IEEE Conference on Computer Vision and
          Pattern Recognition (CVPR), vol. 7, 2017.`,
      },
      {
        type: 'text',
        content: `[7] X. Cao, Y. Wei, F. Wen, and J. Sun, “Face alignment by explicit shape
          regression,” International Journal of Computer Vision, vol. 107, no. 2,
          pp. 177–190, 2014.`,
      },

      {
        type: 'text',
        content: `[13] J. Zhang, S. Shan, M. Kan, and X. Chen, “Coarse-to-fine auto-encoder
          networks (cfan) for real-time face alignment,” in European Conference
          on Computer Vision. Springer, 2014, pp. 1–16.`,
      },
      {
        type: 'text',
        content: `[14] C. Wang, H. Sun, J. Lu, J. Feng, and J. Zhou, “Multiscale recurrent
          regression networks for face alignment,” in Applied Informatics, vol. 4,
          no. 1. SpringerOpen, 2017, p. 13.`,
      },
      {
        type: 'text',
        content: `[24] S. Zhu, C. Li, C. Change Loy, and X. Tang, “Face alignment by coarse-to-fine shape searching,” in Proceedings of the IEEE Conference on
          Computer Vision and Pattern Recognition, 2015, pp. 4998–5006.`,
      },
      {
        type: 'text',
        content: `[25] X. Xiong and F. De la Torre, “Supervised descent method and its
          applications to face alignment,” in Proceedings of the IEEE conference
          on computer vision and pattern recognition, 2013, pp. 532–539.`,
      },

      {
        type: 'text',
        content: `[26] X. P. Burgos-Artizzu, P. Perona, and P. Dollár, “Robust face landmark
          estimation under occlusion,” in Proceedings of the IEEE International
          Conference on Computer Vision, 2013, pp. 1513–1520.`,
      },
      {
        type: 'text',
        content: `[27] G. Tzimiropoulos and M. Pantic, “Gauss-newton deformable part models
          for face alignment in-the-wild,” in Proceedings of the IEEE Conference
          on Computer Vision and Pattern Recognition, 2014, pp. 1851–1858.`,
      },
      {
        type: 'text',
        content: `[28] W. Wang, S. Tulyakov, and N. Sebe, “Recurrent convolutional face
          alignment,” in Asian Conference on Computer Vision. Springer, 2016,
          pp. 104–120.`,
      },
      {
        type: 'text',
        content: `[29] S. Ren, X. Cao, Y. Wei, and J. Sun, “Face alignment at 3000 fps via
          regressing local binary features,” in Proceedings of the IEEE Conference
          on Computer Vision and Pattern Recognition, 2014, pp. 1685–1692.`,
      },
      {
        type: 'text',
        content: `[30] X. Xu and I. A. Kakadiaris, “Joint head pose estimation and face
          alignment framework using global and local cnn features,” in Automatic
          Face & Gesture Recognition (FG 2017), 2017 12th IEEE International
          Conference on. IEEE, 2017, pp. 642–649.`,
      },
      {
        type: 'text',
        content: `[31] G. Trigeorgis, P. Snape, M. A. Nicolaou, E. Antonakos, and S. Zafeiriou,
          “Mnemonic descent method: A recurrent process applied for end-to-end
          face alignment,” in Proceedings of the IEEE Conference on Computer
          Vision and Pattern Recognition, 2016, pp. 4177–4187.`,
      },
      {
        type: 'text',
        content: `[32] Q. Hou, J. Wang, R. Bai, S. Zhou, and Y. Gong, “Face alignment
          recurrent network,” Pattern Recognition, vol. 74, pp. 448–458, 2018.`,
      },
      {
        type: 'text',
        content: `[33] H. Zhang, Q. Li, Z. Sun, and Y. Liu, “Combining data-driven and model-driven methods for robust facial landmark detection,” IEEE Transactions
          on Information Forensics and Security, vol. 13, no. 10, pp. 2409–2422,
          2018.`,
      },
      {
        type: 'text',
        content: `[34] H. Lai, S. Xiao, Y. Pan, Z. Cui, J. Feng, C. Xu, J. Yin, and S. Yan, “Deep
          recurrent regression for facial landmark detection,” IEEE Transactions
          on Circuits and Systems for Video Technology, 2016.`,
      },
      {
        type: 'text',
        content: `[35] M. Kowalski, J. Naruniec, and T. Trzcinski, “Deep alignment network: A
          convolutional neural network for robust face alignment,” in Proceedings
          of the International Conference on Computer Vision & Pattern Recognition (CVPRW), Faces-in-the-wild Workshop/Challenge, vol. 3, no. 5,
          2017, p. 6.`,
      },
      {
        type: 'text',
        content: `[36] H. Liu, J. Lu, J. Feng, and J. Zhou, “Learning deep sharable and
          structural detectors for face alignment,” IEEE Transactions on Image
          Processing, vol. 26, no. 4, pp. 1666–1678, 2017.`,
      },
    ],
  },
  {
    key: 1,
    thumbnail: techimg_01,
    title: `(Paper Review) Age-gender estimation`,
    subtitle: [
      `Face analysis plays an important role in face-centric applications. This paper introduces a technology that, from facial images, predicts age and gender simultaneously.`,
    ],
    time: 'Jan 19, 2021',
    subscription: 'Age / Gender Estimation',
    originalLink: 'https://blog.genesislab.ai/?p=12652',
    category: ['Detection'],
    author_thumbnail: SeHun,
    author: 'Se Hun Kim',
    position: 'AI Scientist',
    description: [
      {
        type: 'text',
        content:
          'This paper was published by KAIST IVY Lab. and Genesis Lab from the IITP(Institute of Information & Communications Technology Planning & Evaluation) 2018 ICT R&D Voucher project.',
      },
      { type: 'title', content: 'Original Paper' },
      {
        type: 'link',
        content:
          'Adversarial Spatial Frequency Domain Critic Learning for Age and Gender Classification',
        link: 'https://ieeexplore.ieee.org/document/8451616',
      },
      { type: 'title', content: 'Proposed method' },
      {
        type: 'text',
        content:
          'The main idea in this paper is to synthesize the age and gender, dominantly revealed by spatial frequency domain, into a generated image. Another technique practiced was calculating the loss by repeatedly alternating learning the age and gender. Details of that are as follows:',
      },
      {
        type: 'img',
        content: tech0fig1,
      },
      {
        type: 'title',
        content: '1. Encoder-Generator',
      },
      {
        type: 'text',
        content:
          'The encoder-generator is similar to DCGAN. The encoder extracts features from the real image input from the CNN network, and the generator creates a fake image using the input values. The difference from DCGAN, however, is that here the output value of the encoder is synthesized with the label of age and gender to be used as the input value of the generator. The generator receives the age and gender and attempts to create a face image with those two pieces of information taken into account.',
      },
      {
        type: 'title',
        content: '2. Adversarial Spatial Frequency Domain Critic',
      },
      {
        type: 'text',
        content: `Adversarial spatial frequency domain critic plays the role of maintaining age and gender characteristics while reducing the noise and identifying the appearance of the generated image.
            The public data sets shown in Fig. 2. (a) and (b) were classified by age and gender and then calculated for the average. Then when the gradient of the CNN activation was studied, different areas had been activated. The activation classified by age as shown in Fig. 2. (c) and (d), the texture such as wrinkles stood out, while in the activation classified by gender, the landmarks around the face such as the nose, eyes, and mouth stood out.`,
      },
      { type: 'img', content: tech0fig2 },
      { type: 'fig-desc', content: 'Fig. 2. Average image and activation by class' },
      {
        type: 'text',
        content:
          'The characteristic values of age and gender, as shown in the images (a) and (b) in Fig. 3 which are obtained by multiplying the activation of each classification and the Fourier transformed images, have revealed dominantly in different spatial frequency domains.',
      },
      { type: 'img', content: tech0fig3 },
      {
        type: 'fig-desc',
        content:
          'Fig. 3. (a) shows the results after multiplying the spatial frequency in Fig. 2. (a) and Fig. 2. (c); (b) shows the results after multiplying the spatial frequency in Fig. 2. (b) and Fig. 2. (d)',
      },
      {
        type: 'additional',
        content:
          'Fourier transform is a mathematical transform that decomposes functions depending on space or time into functions depending on spatial or temporal frequency. It is also used to filter for specific characteristics by selecting only the desired frequency.',
      },
      {
        type: 'text',
        content:
          'We use this characteristic of spatial frequency to preserve the characteristics of age and gender while reducing other feature characteristics by creating a mask. Different masks are created for age and gender, as they reduce other feature characteristics by multiplying 1 in the spatial frequency domain where age and gender are prominent, while multiplying a constant between 1 and 0 for other spatial frequency domains.',
      },
      { type: 'img', content: tech0fig4 },
      { type: 'fig-desc', content: 'Equation 1. critic mask formula' },
      { type: 'img', content: tech0fig5 },
      { type: 'fig-desc', content: 'Equation 2. critic loss function formula' },
      { type: 'title', content: '3. Discriminator for multi-task classification' },
      {
        type: 'text',
        content:
          'In this paper, the proposed discriminator plays two roles. It, similar to the GAN, screens the authenticity of the image, and classifies age and gender. The loss function is calculated by role, and age and gender are also calculated separately. Age is classified into 8 classes using the cross-entropy loss function.',
      },
      { type: 'title', content: '4. Alternating learning' },
      {
        type: 'text',
        content: `The learning to classify age and gender takes place on the same network, but it takes place alternately. As seen in Algorithm 1., the encoder-generator learns to reduce the 'loss for encoder-generator' and 'critic loss for gender.' The learning — for encoder-generator, critics, and discriminator — takes place first for gender, and then the same learning for age proceeds. As seen above, alternating learning is repeated for every epoch.`,
      },

      { type: 'img', content: tech0fig6 },
      { type: 'fig-desc', content: 'Algorithm 1.' },
      { type: 'title', content: 'Experiment results' },
      {
        type: 'text',
        content:
          'The experiment was conducted using Adience benchmark and LFW dataset. The results, from comparing handcraft-based methods and the CNN-based method, showed that the method using masks introduced in this paper showed higher accuracy than any other method. Even without the use of the mask method introduced in this paper, the classification of age showed superior accuracy.',
      },
      { type: 'img', content: tech0fig7 },
      { type: 'title', content: 'Conclusion' },
      {
        type: 'text',
        content:
          'We have confirmed that the proposed spatial frequency domain critic network, and the alternating learning strategy performed better than any other method in classifying age and gender. The generated image created from filtering specific regions of spatial frequency domain preserved age and gender information better, and the ability to classify age and gender was improved further by the alternate learning strategies.',
      },
      { type: 'title', content: 'References' },
      {
        type: 'text',
        content: `[5] E. Eidinger, R. Enbar, T. Hassner, “Age and Gender
            Estimation of Unfiltered Faces,” Trans. On Inform, Forensics
            and Security, 2014.`,
      },
      {
        type: 'text',
        content: `[11] G. Levi, T. Hassner. “Age and gender classification using
          convolutional neural networks,” IEEE Conference on
          Computer Vision and Pattern Recognition Workshops, pp.34-
          42, 2015.`,
      },
      {
        type: 'text',
        content: `[12] H. L. Hsieh, W. Hsu, Y. Y. Chen, “Multi-task learning for
          face identification and attribute estimation,” IEEE
          International Conference on Acoustic, Speech and Signal
          Processing, 2017.`,
      },
      {
        type: 'text',
        content: `[20] T. Hassner, S. Harel, E. Paz, R. Enbar, “Effective face
          frontalization in unconstrained images,” in Proceedings of the
          IEEE Conference on Computer Vision and Pattern Recognition,
          pp.4295-4304, 2015.`,
      },
    ],
  },
  {
    key: 2,
    thumbnail: techblog2Thumb,
    title: `(Paper review) Encoding features robust to unseen modes of variation with attentive long short-term memory`,
    subtitle: [
      `When classifying the dynamics in video sequences of facial expressions, what could significantly hurt the quality of encoded features—some of which are irrelevant to the facial expression recognition task—is called mode of variation: subject appearance variations, viewpoint variations, illumination or even body posture variations. This paper presents a method to improve the classification performance by minimizing the impact of mode of variation.`,
    ],
    author_thumbnail: JongHwa,
    subscription: 'Video based Facial Expression Recognition',
    author: 'Jong Hwa Lee',
    position: 'AI Scientist',
    time: 'Mar 3, 2021',
    originalLink: 'https://blog.genesislab.ai/?p=12913',
    category: ['Detection'],
    description: [
      {
        type: 'text',
        content:
          'This paper is the product of an industry-academic project between KAIST IVY Lab and Genesis Lab.',
      },
      { type: 'title', content: 'Original Paper' },
      {
        type: 'link',
        content:
          'Encoding features robust to unseen modes of variation with attentive long short-term memory',
        link: 'https://www.sciencedirect.com/science/article/abs/pii/S0031320319304595',
      },
      { type: 'title', content: 'Introduction' },
      {
        type: 'text',
        content: `When using deep learning to recognize a person's facial expression, there is a lot of information in the data that can interfere with the task. For example, appearance variations, viewpoint variations, body posture variations, or even illumination acts as a hindrance to accurate recognition of facial expressions. These disturbances are called mode of variation. In order to overcome these barriers, a method has also been used to expose its model to as many variations as possible so that the model can recognize variations on its own. However, when it comes to using videos to get encoded features, obtaining training datasets that could contain all possible modes of variation is out of the question. In other words, unseen mode of variation should always exist and this paper proposes a method that can minimize it.`,
      },
      { type: 'title', content: 'Proposed method' },
      { type: 'img', content: techblog2fig1 },
      {
        type: 'fig-desc',
        content: `Fig 1. Overview of the proposed method for encoding features robust to unseen modes of variation.`,
      },
      {
        type: 'text',
        content: `This paper divides continuous input data such as video sequences into two types: task-relevant dynamic sequence features and task-irrelevant static sequence features. Task-relevant dynamic sequence features to encode dynamic features mean the parts where there is a change in a previous scene and a current one when the video is divided into frames. On the other hand, task-irrelevant static sequence features are unchanging parts of the image. The task-irrelevant static sequence features are to encode mode of variation, a barrier to given recognition tasks. This separately encoded feature information yields the Spatio-temporal feature robust to unseen mode variations not to be affected by variations unseen during the training. More details are as follows:`,
      },
      { type: 'img', content: techblog2fig2 },
      { type: 'fig-desc', content: `Fig. 2. The proposed attentive mode variational LSTM.` },
      { type: 'title', content: 'Input signal separator' },
      {
        type: 'inline-math',
        content: [
          {
            inlineType: 'text',
            content:
              'The input signal separator in Fig 2, inspired by the concept of the element-wise attention gate proposed in [4], separates input sequence features into two parts: task-relevant dynamic sequence features ',
          },
          { inlineType: 'math', content: `x_{t}^{att}` },
          { inlineType: 'text', content: ' and task-irrelevant static sequence features ' },
          { inlineType: 'math', content: `\\hat x_{t}^{att}` },
          { inlineType: 'text', content: '. ' },
          { inlineType: 'math', content: `a_{t}` },
          {
            inlineType: 'text',
            content: ' (the element-wise dynamics attention) is necessary to get ',
          },
          { inlineType: 'math', content: `x_{t}^{att}` },
          { inlineType: 'text', content: ' and ' },
          { inlineType: 'math', content: `\\hat x_{t}^{att}` },
          {
            inlineType: 'text',
            content: ', and the element-wise dynamics attention gate helps get ',
          },
          { inlineType: 'math', content: `a_{t}` },
          { inlineType: 'text', content: ` as shown in the equation below.` },
        ],
      },
      {
        type: 'math',
        content: `a_{t} = \\alpha(W_{xa}x_{t}+W_{ha}H_{t-1}+b_{a})`,
      },
      {
        type: 'inline-math',
        content: [
          { inlineType: 'math', content: `x_{t}^{att}` },
          {
            inlineType: 'text',
            content: ' can be obtained by multiplying ',
          },
          { inlineType: 'math', content: `a_{t}` },
          { inlineType: 'text', content: ' by time (t). Then, ' },
          { inlineType: 'math', content: `x_{t}^{att}` },
          {
            inlineType: 'text',
            content: ' is used to emphasize dynamic features in the input sequence [4].',
          },
        ],
      },
      {
        type: 'math',
        content: `x_{t}^{att}=a_{t} \\odot x_{t}`,
      },
      {
        type: 'inline-math',
        content: [
          {
            inlineType: 'text',
            content:
              'Please note that in the paper [4], static sequences are completely ignored, whereas in this paper they are used to encode modes of variation that prevent the recognition and identification of features. Task-irrelevant static sequence features use the remaining area where ',
          },
          { inlineType: 'math', content: 'a_{t}' },
          { inlineType: 'text', content: ' has been removed from the entire input data.' },
        ],
      },
      {
        type: 'math',
        content: `\\hat x_{t}^{att}=(1-a_{t}) \\odot x_{t}`,
      },
      {
        type: 'title',
        content: 'Encoding dynamic features and mode of variation',
      },
      {
        type: 'inline-math',
        content: [
          {
            inlineType: 'text',
            content: 'Encoding dynamic features and mode of variation ',
          },
          { inlineType: 'math', content: `x_{t}^{att}` },
          {
            inlineType: 'text',
            content: ' through the input gate and the forget gate are stored in memory cell ',
          },
          { inlineType: 'math', content: `c_{t}` },
          { inlineType: 'text', content: '. ' },
          { inlineType: 'math', content: `\\hat x_{t}^{att}` },
          {
            inlineType: 'text',
            content:
              ' also goes through the input gate and the forget gate to obtain the mode of variation which is stored in another memory cell ',
          },
          { inlineType: 'math', content: `\\hat c_{t}` },
          {
            inlineType: 'text',
            content:
              '. The stored feature data in the two memory cells then pass through one shared output gate and, by repeating the process several times, spatio-temporal feature information robust to unseen mode variations is obtained. Notice that only one output gate is used. This is primarily the case for two reasons: (1) to synchronize dynamics features and mode of variation and (2) to minimize the effect of the mode of variation on the dynamics features encoded by bringing the current and previous mode of variation together.',
          },
        ],
      },
      { type: 'title', content: 'Experiment' },
      {
        type: 'text',
        content:
          'To validate the effectiveness of the proposed model, two tasks have been performed: facial expression recognition task and human action recognition task. For facial expression recognition, three datasets were used. Oulu-CASIA facial expression dataset in which sequences of the six basic facial expressions (i.e., angry, disgust, fear, happy, sad, and surprise) were collected from 80 subjects under three different illumination conditions; AFEW dataset to emulate real-world conditions collected from movies; and the KAIST face multi-pose multi-illumination (KAIST Face MPMI) dataset that, via thirteen web cameras, simultaneously recorded each expression sequence of the seven expressions (the six basic expressions and a neutral expression sequence) from 104 subjects.',
      },
      {
        type: 'text',
        content:
          'The objective of experiment 1 was to compare performance with previously proposed models. Tables 1, 2, and 3 show the results for each dataset, and the model proposed in this paper outperforms state-of-the-art methods on both tasks.',
      },
      { type: 'img', content: techblog2table1 },
      { type: 'img', content: techblog2table2 },
      { type: 'img', content: techblog2table3 },
      {
        type: 'text',
        content:
          'Experiment 2 evaluates the robustness of the proposed method to encode features robust to unseen modes of variation. In the experiment, the Oulu-CASIA dataset is divided into Asian and Finnish (Caucasian) subjects to see if appearance reduces facial expression recognition rates. Table 4 shows the results of two tests: the models were first trained on the Asian subjects and validated on the Caucasian subjects, and then vice versa. This result shows that the proposed model in this paper encodes features that are more robust to subject appearance variations.',
      },
      { type: 'img', content: techblog2fig4 },
      {
        type: 'text',
        content:
          'Another experiment was carried out with the KAIST Face MPMI dataset to assess the robustness of the proposed model towards unseen illumination variations. To that end, the models are trained using only the sequences captured under the room illumination conditions and tested on all illumination variations (i.e., sequences with room illumination condition, bright illumination condition, left illumination condition and right illumination condition). The proposed model shows excellent recognition rates regardless of the direction and strength of illumination.',
      },
      { type: 'img', content: techblog2fig5 },
      { type: 'title', content: 'Conclusion' },
      {
        type: 'text',
        content:
          'In this paper, we proposed a method to encode features robust to unseen modes of variation. Attentive mode variable LSTM, the core of this proposal, uses the concept of attention to separate the input sequence into task-relevant dynamic sequence features and task-irrelevant static sequence features. Each of the features is used to encode dynamics features and mode of variation, and the encoded information is converted into a spatio-temporal feature robust to unseen mode variations through a single shared output gate. Experiments conducted on two tasks, facial expression recognition and human action recognition, demonstrated the effectiveness of the proposed method and proved to be superior to the state of the art (SOTA) in each task.',
      },
      { type: 'title', content: 'References' },
      {
        type: 'text',
        content:
          '[1] M. Wang, Y. Panagakis, P. Snape, S.P. Zafeiriou, Disentangling the modes of variation in unlabelled data, IEEE Trans. Pattern Anal. Mach. Intell. (2017)',
      },
      {
        type: 'text',
        content:
          '[2] A. Shahroudy, J. Liu, T.-T. Ng, G. Wang, NTU RGB+D: a large scale dataset for 3D human activity analysis, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 1010–1019.',
      },
      {
        type: 'text',
        content:
          '[3] P. Zhang, C. Lan, J. Xing, W. Zeng, J. Xue, N. Zheng, View adaptive recurrent neural networks for high performance human action recognition from skeleton data, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 2117–2126',
      },
      {
        type: 'text',
        content:
          '[4] P. Zhang, J. Xue, C. Lan, W. Zeng, Z. Gao, N. Zheng, Adding attentiveness to the neurons in recurrent neural networks, in: Proceedings of the European Conference on Computer Vision (ECCV), 2018, pp. 135–151. [16] W.J. Baddar, Y.M. Ro, Mode variational lstm robust to unseen',
      },
    ],
  },
  {
    key: 3,
    thumbnail: techblog3Thumb,
    title: `(Publication) Leveraging the Generalization Ability of Deep Convolutional Neural Networks for Improving Classifiers for Color Fundus Photographs`,
    author_thumbnail: JaeYoung,
    author: 'Jae Young Kim',
    position: 'AI Scientist',
    time: 'Mar 19, 2021',
    originalLink: 'https://blog.genesislab.ai/?p=12957',
    category: ['Prediction & Analysis'],
    description: [
      { type: 'title', content: 'Paper' },
      {
        type: 'link',
        content:
          'Leveraging the Generalization Ability of Deep Convolutional Neural Networks for Improving Classifiers for Color Fundus Photographs',
        link: 'https://www.mdpi.com/2076-3417/11/2/591/htm',
      },
      { type: 'title', content: 'Author' },
      {
        type: 'text',
        content: `Jaemin Son, Jaeyoung Kim, Seo Taek Kong, and Kyu-Hwan Jung`,
      },
      { type: 'title', content: 'Author Contributions' },
      {
        type: 'text',
        content: `Author Contributions: Conceptualization, J.S., J.K. and S.T.K.; Formal Analysis, J.S. and J.K.; Methodology, J.S., J.K., S.T.K. and K.-H.J.; Writing—Original draft, J.S., J.K. and S.T.K.; Writing—Review & Editing, J.S. and K.-H.J.; Supervision, K.-H.J. All authors have read and agreed to the published version of the manuscript.`,
      },
      { type: 'title', content: 'Introduction' },
      {
        type: 'text',
        content: `Deep neural networks (DNNs) achieve their advanced performance through supervised learning, which requires a large amount of annotated data. The annotation task is often crowdsourced for economic efficiency, but the dataset annotated by non-experts with limited supervision may contain inaccurate labels. Noisy labels not only yield DNNs with sub-optimal performance but may also impede their optimization dynamics. Furthermore, since acquiring samples for minor classes is inherently difficult or time-consuming, real-world data often exhibit a long-tailed distribution. Therefore, when applying deep learning (DL) algorithms to practical applications, handling those challenges plays a crucial role in the robustness of DNNs.`,
      },
      {
        type: 'text',
        content: `My team proposed a method to address the aforementioned problems, participated in the pathological myopia classification challenge (PALM) held at MICCAI, a top conference for medical image processing, and won first place in the classification challenge. This post introduces various practical techniques used in the competition; please refer to the paper for more detailed information.`,
      },
      {
        type: 'text',
        content: `Although the proposed method has been applied to medical images that are different from the areas covered by the Genesis Lab, we believe that this post is meaningful because the proposed method can be extended to other domains.`,
      },
      { type: 'title', content: 'Dataset' },
      { type: 'img', content: techblog3fig1 },
      {
        type: 'text',
        content: `Fundus photographs were used in order to classify pathological myopia (PM), and the images above are examples of fundus images in which PM occurred. PM could be diagnosed due to other clinical evidence such as retinal detachment or characteristics of tigroid patterns. PM typically occurs in 0.9–3.1% among Asian ethnicities and 1.2% among Australian ethnicities. In contrast to standard deep learning (DL) datasets with millions of annotated samples, the PALM dataset is far smaller, with only 400 samples, and contains even fewer positive cases.`,
      },
      { type: 'title', content: 'Method' },
      {
        type: 'text',
        content:
          'In PALM competition, two main methods were used to improve the generalization ability of DNNs:',
      },
      {
        type: 'text',
        content: '1) the use of semi-supervised learning (SSL) using unlabeled samples',
      },
      {
        type: 'text',
        content:
          '2) the design of a filtration network that detects clean positive data to reduce the effect of the noisy dataset; the network serves to identify mislabeled cases in the training set.',
      },
      {
        type: 'title',
        content: 'Leveraging the Generalization Ability of DNNs for pseudo-labeling',
      },
      {
        type: 'text',
        content:
          'One of the ways to compensate for the lack of data is SSL using unlabeled data, among which pseudo-labeling is a method proven to be effective in many studies. However, the existing pseudo-labeling procedure typically deals with cases with balanced distributions between classes and thus, when pseudo-labels are generated by a model trained in a long-tailed distribution dataset, the artificial labels are inaccurate for minor classes. Therefore, in order to obtain a more accurate pseudo-label, we hypothesize as follows, by considering the prior knowledge of the domain and generalization ability:',
      },
      {
        type: 'text',
        content:
          '1. Since the rate at which pathological myopia (PM) usually occurs is 0.9–3.1%, the number of PM cases in public fundus images would not outnumber that of normal cases unless it was a PM dataset.',
      },
      {
        type: 'text',
        content:
          '2. At the beginning of training, DNNs first learn the general feature of a dataset, even in the presence of noises, and memorization follows as the training progresses.',
      },
      {
        type: 'text',
        content: `Based on the above hypotheses, an unlabeled public dataset was collected, the ‘normal’ class was assigned to all the unlabeled images, and the public dataset was then combined with the PALM dataset. This introduces label noise, since all the labels in the public dataset are assumed to be ‘normal’; therefore, predictions made in the initial stage of training are used to generate pseudo-labels for the public dataset. The model was then re-trained from scratch using this artificially annotated public dataset and validated using the PALM dataset. A total of 91,509 unlabeled fundus images were acquired from the publicly available datasets Kaggle [1], Messidor [2], IDRiD [3], REFUGE [4], and RIGA [5].`,
      },
      { type: 'title', content: `Filtering Suspicious Data` },
      {
        type: 'inline-math',
        content: [
          {
            inlineType: 'text',
            content:
              'To reduce the negative effect of the noisy dataset, a filtration network that detects clean positive data was designed, and the clean data identified by the filtration network ',
          },
          { inlineType: 'math', content: 'b' },
          { inlineType: 'text', content: ' were included in PM classification model ' },
          { inlineType: 'math', content: `f \\rightarrow \\hat{p}\\in\\Delta(\\mathbb{Y})` },
          { inlineType: 'text', content: ' training.' },
        ],
      },
      {
        type: 'inline-math',
        content: [
          {
            inlineType: 'math',
            content: 'b',
          },
          {
            inlineType: 'text',
            content: ' detects clean positive data using posterior distribution of ',
          },
          { inlineType: 'math', content: 'D_{val}' },
          { inlineType: 'text', content: ' and ' },
          { inlineType: 'math', content: 'f' },
          { inlineType: 'text', content: '. The filtration network ' },
          { inlineType: 'math', content: 'b' },
          {
            inlineType: 'text',
            content: ' is trained at each validation step and takes a posterior distribution ',
          },
          { inlineType: 'math', content: `\\hat{p}` },
          { inlineType: 'text', content: ' of ' },
          { inlineType: 'math', content: 'f' },
          { inlineType: 'text', content: ' as an input. Optimizing ' },
          { inlineType: 'math', content: 'b' },
          { inlineType: 'text', content: ' is done by minimizing errors between output of ' },
          { inlineType: 'math', content: 'b' },
          { inlineType: 'text', content: ' and ' },
          { inlineType: 'math', content: 'y_{val}' },
          {
            inlineType: 'text',
            content:
              '. Because the filtration network was trained on data with clean labels, a well-trained filtration network would identify clean positive data by predicting high values on positive images with clean labels and predicting low values for suspiciously negative images. As ',
          },
          { inlineType: 'math', content: 'b^*' },
          {
            inlineType: 'text',
            content:
              ' is trained at each validation step (one training epoch), logistic regression model ',
          },
          { inlineType: 'math', content: `(b(\\hat{p})=w^T\\hat{p})` },
          {
            inlineType: 'text',
            content: ' is used for training time efficiency. Consequently, if ',
          },
          { inlineType: 'math', content: 'b^*' },
          { inlineType: 'text', content: ' detects the clean data with higher confidence than ' },
          { inlineType: 'math', content: '\\tau' },
          { inlineType: 'text', content: ' for training samples in ' },
          { inlineType: 'math', content: 'D_{train}' },
          { inlineType: 'text', content: ', the data contribute to the training of ' },
          { inlineType: 'math', content: 'f' },
          { inlineType: 'text', content: ':' },
        ],
      },
      {
        type: 'math',
        content: `\\theta_f = \\theta_f - \\eta \\nabla L(\\theta_f;x,y)\\mathbf{1}\\{b^*(\\hat{p})>\\tau\\}`,
      },
      { type: 'img', content: techblog3table1 },
      {
        type: 'inline-math',
        content: [
          {
            inlineType: 'text',
            content:
              'To verify the validity of the filtration network, quantitative results were compared with those of existing state-of-the-art methods using the Kaggle 2015 dataset with 35,126 images (17,563 eyes) for training and 53,576 images (26,788 eyes) for testing, and the proposed method achieved the highest accuracy in ',
          },
          {
            inlineType: 'math',
            content: 'p < 0.5',
          },
          { inlineType: 'text', content: ' (Table 1).' },
        ],
      },
      { type: 'title', content: 'Conclusion' },
      {
        type: 'text',
        content:
          'In this post, we presented a method for filtering noisy labels based on the classifier’s confidence and SSL based on pseudo-labels using publicly available unlabeled data. Our noisy label filtration method outperformed existing methods in the presence of noisy data, and the pseudo-labeling procedure using the generalization ability of DNNs was effective in identifying rare positive cases, achieving high performance by enabling the model to learn unfamiliar patterns that are difficult to learn from the original small training set. In the end, we attained an AUROC of 0.9993 in the PALM competition, ranking first on the off-site validation set.',
      },
      { type: 'img', content: techblog3fig2 },
      {
        type: 'text',
        content:
          'The approach proposed for the PALM competition was applied to medical data, but we expect it to be applicable in various real-world settings that require a large volume of labeled data.',
      },

      { type: 'title', content: 'References' },
      {
        type: 'text',
        content:
          '[1] Kaggle Diabetic Retinopathy Detection Competition Report. 2015. (accessed on 20 May 2019).',
      },
      {
        type: 'text',
        content:
          '[2] Decencière, E.; Zhang, X.; Cazuguel, G.; Lay, B.; Cochener, B.; Trone, C.; Gain, P.; Ordonez, R.; Massin, P.; Erginay, A.; et al. Feedback on a publicly distributed database: the Messidor database. Image Anal. Stereol. 2014, 33, 231–234. ',
      },
      {
        type: 'text',
        content:
          '[3] Porwal, P.; Pachade, S.; Kamble, R.; Kokare, M.; Deshmukh, G.; Sahasrabuddhe, V.; Meriaudeau, F. Indian Diabetic Retinopathy Image Dataset (IDRiD): A Database for Diabetic Retinopathy Screening Research. Data 2018, 3, 25.',
      },
      {
        type: 'inline-link',
        content: [
          {
            inlineType: 'text',
            content: '[4] Retinal Fundus Glaucoma Challenge. Available online: ',
          },
          {
            inlineType: 'link',
            content: 'http://refuge.grand-challenge.org',
            link: 'https://refuge.grand-challenge.org/',
          },
          { inlineType: 'text', content: ' (accessed on 20 May 2019).' },
        ],
      },
      {
        type: 'text',
        content: `[5] Almazroa, A.; Alodhayb, S.; Osman, E.; Ramadan, E.; Hummadi, M.; Dlaim, M.; Alkatee, M.; Raahemifar, K.; Lakshminarayanan, V. Retinal fundus images for glaucoma analysis: The RIGA dataset. In Proceedings of the Medical Imaging 2018: Imaging Informatics for Healthcare, Research, and Applications. International Society for Optics and Photonics, Houston, TX, USA, 10–15 February 2018; Volume 10579, p. 105790B.`,
      },
    ],
  },
];
