/***
 *  $Id$
 **
 *  File: sequence_compare.hpp
 *  Created: May 03, 2012
 *
 *  Author: Jaroslaw Zola <jaroslaw.zola@hush.com>
 *  Copyright (c) 2012-2017 Jaroslaw Zola
 *  Distributed under the Boost Software License, Version 1.0.
 *
 *  Boost Software License - Version 1.0 - August 17th, 2003
 *
 *  Permission is hereby granted, free of charge, to any person or organization
 *  obtaining a copy of the software and accompanying documentation covered by
 *  this license (the "Software") to use, reproduce, display, distribute,
 *  execute, and transmit the Software, and to prepare derivative works of the
 *  Software, and to permit third-parties to whom the Software is furnished to
 *  do so, all subject to the following:
 *
 *  The copyright notices in the Software and this entire statement, including
 *  the above license grant, this restriction and the following disclaimer,
 *  must be included in all copies of the Software, in whole or in part, and
 *  all derivative works of the Software, unless such copies or derivative
 *  works are solely in the form of machine-executable object code generated by
 *  a source language processor.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 *  SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 *  FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 *  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *  DEALINGS IN THE SOFTWARE.
 */

#ifndef SEQUENCE_COMPARE_HPP
#define SEQUENCE_COMPARE_HPP

#include <algorithm>
#include <cmath>
#include <cstring>
#include <fstream>
#include <functional>
#include <limits>
#include <map>
#include <string>
#include <vector>

#include <boost/tuple/tuple.hpp>


namespace bio {

  namespace detail {

    // this code comes from jaz
    /** Function: intersection_size
     *
     *  Walk two ranges in parallel, merge-style, and count the positions
     *  at which neither element is ordered before the other by pred.
     *  The walk is meaningful only if both ranges are sorted with respect
     *  to pred. Repeated elements are matched one-to-one.
     *
     *  Parameter:
     *  first1, last1 - first range.
     *  first2, last2 - second range.
     *  pred - strict ordering predicate, e.g. std::less.
     *
     *  Returns:
     *  Number of matched element pairs.
     */
    template <typename Iter1, typename Iter2, typename Pred>
    int intersection_size(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2, Pred pred) {
        int matched = 0;

        for (; (first1 != last1) && (first2 != last2); ) {
            // element of the first range is smaller: skip it
            if (pred(*first1, *first2)) {
                ++first1;
                continue;
            }

            // element of the second range is smaller: skip it
            if (pred(*first2, *first1)) {
                ++first2;
                continue;
            }

            // equivalent elements: consume one from each range
            ++matched;
            ++first1;
            ++first2;
        } // for

        return matched;
    } // intersection_size


    /** Function: count_distance
     *
     *  Sum of squared differences between two sparse count vectors.
     *  Each range holds (key, count) pairs, accessed through ->first and
     *  ->second, and must be sorted by key (as e.g. std::map iteration is).
     *  A key present in only one range is treated as having count 0
     *  in the other, so it contributes the square of its count.
     *
     *  Parameter:
     *  first1, last1 - first range of (key, count) pairs.
     *  first2, last2 - second range of (key, count) pairs.
     *
     *  Returns:
     *  Sum over all keys of (count1 - count2)^2, accumulated in int.
     */
    template <typename Iter1, typename Iter2>
    int count_distance(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2) {
        int S = 0;

        while ((first1 != last1) && (first2 != last2)) {
            if (first1->first < first2->first) {
                S += (first1->second * first1->second);
                ++first1;
            }
            else if (first2->first < first1->first) {
                S += (first2->second * first2->second);
                ++first2;
            }
            else {
                int d = (first1->second - first2->second);
                S += d * d;
                ++first1;
                ++first2;
            }
        } // while

        // One range may still have keys left once the other is exhausted;
        // these keys are missing from the other range and must be counted too.
        for (; first1 != last1; ++first1) S += (first1->second * first1->second);
        for (; first2 != last2; ++first2) S += (first2->second * first2->second);

        return S;
    } // count_distance


    /** Function: cosine_distance
     *
     *  Cosine of the angle between two sparse count vectors, i.e. their
     *  dot product divided by the product of their Euclidean norms.
     *  Each range holds (key, count) pairs, accessed through ->first and
     *  ->second, and must be sorted by key. Note that despite the name
     *  the value grows with similarity.
     *
     *  Parameter:
     *  first1, last1 - first range of (key, count) pairs.
     *  first2, last2 - second range of (key, count) pairs.
     *
     *  Returns:
     *  dot / (norm1 * norm2), or 0 if either vector has zero norm.
     */
    template <typename Iter1, typename Iter2>
    double cosine_distance(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2) {
        double S = 0.0;

        double p1 = 0.0;
        double p2 = 0.0;

        while ((first1 != last1) && (first2 != last2)) {
            // products are formed in double so that large counts
            // cannot overflow the integer type
            const double c1 = first1->second;
            const double c2 = first2->second;

            if (first1->first < first2->first) {
                p1 += c1 * c1;
                ++first1;
            }
            else if (first2->first < first1->first) {
                p2 += c2 * c2;
                ++first2;
            }
            else {
                S += c1 * c2;
                p1 += c1 * c1;
                p2 += c2 * c2;
                ++first1;
                ++first2;
            }
        } // while

        // Keys left in one range after the other is exhausted do not add
        // to the dot product, but they are still part of that vector's norm.
        for (; first1 != last1; ++first1) {
            const double c1 = first1->second;
            p1 += c1 * c1;
        }

        for (; first2 != last2; ++first2) {
            const double c2 = first2->second;
            p2 += c2 * c2;
        }

        return (p1 == 0 || p2 == 0) ? 0 : S / (std::sqrt(p1) * std::sqrt(p2));
    } // cosine_distance


    /** Function: general_kmer_index
     *
     *  List all kmers of s, in order of occurrence.
     *
     *  Parameter:
     *  s - input string.
     *  k - kmer length.
     *  S - output sequence; resized to s.size() - k + 1 and filled so that
     *      S[i] is the substring of length k starting at position i.
     *      If k is negative or s is shorter than k, S is left empty.
     */
    template <typename Sequence> void general_kmer_index(const std::string& s, int k, Sequence& S) {
        typedef std::string::size_type size_type;

        // Without this check s.size() - k + 1 goes negative for short inputs
        // and the resize below is asked for an enormous number of elements.
        if ((k < 0) || (s.size() < static_cast<size_type>(k))) {
            S.resize(0);
            return;
        }

        const size_type end = s.size() - k + 1;
        S.resize(end);
        for (size_type i = 0; i < end; ++i) S[i] = s.substr(i, k);
    } // general_kmer_index


    /** Function: general_kmer_count
     *
     *  Count occurrences of every kmer of s.
     *
     *  Parameter:
     *  s - input string.
     *  k - kmer length.
     *  S - output map; cleared first, then S[kmer] is incremented once for
     *      each of the s.size() - k + 1 starting positions. It stays empty
     *      when that number is not positive.
     */
    template <typename Map> void general_kmer_count(const std::string& s, int k, Map& S) {
        S.clear();

        const int positions = s.size() - k + 1;
        int pos = 0;

        while (pos < positions) {
            ++S[s.substr(pos, k)];
            ++pos;
        }
    } // general_kmer_count


    /** Class: dna_digit
     *
     *  Base class holding a 256-entry table that maps a character to
     *  a nucleotide code: C -> 1, G -> 2, T and U -> 3, in either case.
     *  Every other character, A included, maps to 0.
     */
    class dna_digit {
    public:
        dna_digit() {
            // everything defaults to code 0
            std::memset(digit_, 0, sizeof(digit_));

            const char lower[] = { 'c', 'g', 't', 'u' };
            const char upper[] = { 'C', 'G', 'T', 'U' };
            const char code[] = { 1, 2, 3, 3 };

            for (int i = 0; i < 4; ++i) {
                digit_[lower[i]] = code[i];
                digit_[upper[i]] = code[i];
            }
        } // dna_digit

    protected:
        // character -> code lookup table, available to derived functors
        char digit_[256];

    }; // dna_digit


    /** Class: dna_kmer_index
     *
     *  Functor listing all kmers of a DNA/RNA string as integers.
     *  A kmer is read as a base-4 number, first symbol most significant,
     *  with digits taken from the table inherited from dna_digit.
     */
    class dna_kmer_index : public dna_digit {
    public:
        dna_kmer_index() : dna_digit() { }

        /** Function: operator()
         *
         *  Parameter:
         *  s - input sequence.
         *  k - kmer length. Each symbol takes 2 bits of a 64-bit value,
         *      so k above 32 cannot be represented; this is not checked.
         *  S - output sequence; resized to s.size() - k + 1, S[i] receives
         *      the code of the kmer starting at position i. If k < 1 or
         *      s is shorter than k, S is left empty.
         */
        template <typename Sequence>
        void operator()(const std::string& s, int k, Sequence& S) {
            typedef std::string::size_type size_type;

            // Without this check s[k - 1] is read out of bounds and
            // the resize below is given a wrapped-around negative size.
            if ((k < 1) || (s.size() < static_cast<size_type>(k))) {
                S.resize(0);
                return;
            }

            const size_type end = s.size() - k + 1;
            S.resize(end);

            // first kmer
            unsigned long long int v = code_(s[k - 1]);
            for (int i = 0; i < k - 1; ++i) {
                v += code_(s[i]) * (1ULL << ((k - i - 1) << 1));
            }

            S[0] = v;

            // and then all other: drop the leading digit (weight b),
            // shift by one digit and append the incoming symbol
            const unsigned long long int b = 1ULL << ((k - 1) << 1);

            for (size_type i = 1; i < end; ++i) {
                v = (v - b * code_(s[i - 1])) * 4 + code_(s[i + k - 1]);
                S[i] = v;
            }
        } // operator()

    private:
        // Table lookup through unsigned char: a plain char may be negative
        // for bytes above 127, which would index before the table.
        unsigned long long int code_(char c) const {
            return digit_[static_cast<unsigned char>(c)];
        } // code_

    }; // class dna_kmer_index


    /** Class: dna_kmer_count
     *
     *  Functor counting occurrences of every kmer of a DNA/RNA string.
     *  A kmer is identified by its base-4 integer code, first symbol most
     *  significant, with digits taken from the table inherited from dna_digit.
     */
    class dna_kmer_count : public dna_digit {
    public:
        dna_kmer_count() : dna_digit() { }

        /** Function: operator()
         *
         *  Parameter:
         *  s - input sequence.
         *  k - kmer length. Each symbol takes 2 bits of a 64-bit value,
         *      so k above 32 cannot be represented; this is not checked.
         *  S - output map; cleared first, then S[code] is set to the number
         *      of occurrences of the kmer with that code. If k < 1 or
         *      s is shorter than k, S is left empty.
         */
        template <typename Map>
        void operator()(const std::string& s, int k, Map& S) {
            typedef std::string::size_type size_type;

            S.clear();

            // Without this check s[k - 1] is read out of bounds.
            if ((k < 1) || (s.size() < static_cast<size_type>(k))) return;

            const size_type end = s.size() - k + 1;

            // first kmer
            unsigned long long int v = code_(s[k - 1]);
            for (int i = 0; i < k - 1; ++i) v += code_(s[i]) * (1ULL << ((k - i - 1) << 1));

            S[v] = 1;

            // and then all other: drop the leading digit (weight b),
            // shift by one digit and append the incoming symbol
            const unsigned long long int b = 1ULL << ((k - 1) << 1);

            for (size_type i = 1; i < end; ++i) {
                v = (v - b * code_(s[i - 1])) * 4 + code_(s[i + k - 1]);
                S[v]++;
            }
        } // operator()

    private:
        // Table lookup through unsigned char: a plain char may be negative
        // for bytes above 127, which would index before the table.
        unsigned long long int code_(char c) const {
            return digit_[static_cast<unsigned char>(c)];
        } // code_

    }; // class dna_kmer_count

  } // namespace detail



  /** Class: sequence_compare
   *
   *  A general interface for sequence comparison algorithms.
   *  All algorithms in this library support this interface.
   *  Some, e.g. local_alignment, provide additional methods.
   *
   *  The call is dispatched statically: Derived is the concrete algorithm
   *  class, and the binary operator() simply forwards to the operator()
   *  of Derived, returning its result as a 3-tuple of int.
   */
  template <typename Derived> struct sequence_compare {
      boost::tuple<int, int, int> operator()(const std::string& s0, const std::string& s1) {
          Derived& self = static_cast<Derived&>(*this);
          return self(s0, s1);
      }
  }; // struct sequence_compare


  /** Class: d2
   *
   *  Functor to compute the d2 distance, i.e. the score produced by
   *  detail::count_distance on the kmer counts of two sequences.
   *  The object caches the kmer counts of the most recent s0, which
   *  allows comparing one sequence against many (see unary operator()).
   */
  class d2 : public sequence_compare<d2> {
  public:
      /** Constructor: d2
       *
       *  Parameter:
       *  k - kmer length. It must be positive for any comparison to succeed;
       *      with the default of 0 every call returns the error tuple.
       *  isdna - assume that input sequences are DNA/RNA.
       */
      explicit d2(int k = 0, bool isdna = true) : k_(k), isdna_(isdna) { }

      /** Function: operator()
       *
       *  Compute d2 score between s0 and s1.
       *
       *  Returns:
       *  3-tuple (d2 score, number of unique kmers in s0, number of unique kmers in s1),
       *  or (-1, -1, -1) if k is not positive or either sequence is shorter than k.
       */
      boost::tuple<int, int, int> operator()(const std::string& s0, const std::string& s1) {
          if (!usable_(s0) || !usable_(s1)) return boost::make_tuple(-1, -1, -1);

          if (isdna_) dC_(s0, k_, dcount0_);
          else detail::general_kmer_count(s0, k_, count0_);

          return compare_(s1);
      } // operator()

      /** Function: operator()
       *
       *  Compute d2 score between s0 and s1, where s0 is a sequence
       *  from the previous call of the binary version of this operator.
       *  If that call returned the error tuple, or was never made, the
       *  cached counts of s0 are whatever the object held before.
       *
       *  Returns:
       *  3-tuple (d2 score, number of unique kmers in s0, number of unique kmers in s1),
       *  or (-1, -1, -1) if k is not positive or s1 is shorter than k.
       */
      boost::tuple<int, int, int> operator()(const std::string& s1) {
          if (!usable_(s1)) return boost::make_tuple(-1, -1, -1);
          return compare_(s1);
      } // operator()

  private:
      // True if s has at least one kmer. The explicit k_ > 0 test matters:
      // with k_ == 0 the DNA counter would read before the start of s.
      bool usable_(const std::string& s) const {
          return (k_ > 0) && (s.size() >= static_cast<std::string::size_type>(k_));
      } // usable_

      // Count kmers of s1 and score them against the cached counts of s0.
      boost::tuple<int, int, int> compare_(const std::string& s1) {
          if (isdna_) {
              dC_(s1, k_, dcount1_);
              int S = detail::count_distance(dcount0_.begin(), dcount0_.end(),
                                             dcount1_.begin(), dcount1_.end());
              return boost::make_tuple(S, static_cast<int>(dcount0_.size()),
                                       static_cast<int>(dcount1_.size()));
          }

          detail::general_kmer_count(s1, k_, count1_);
          int S = detail::count_distance(count0_.begin(), count0_.end(),
                                         count1_.begin(), count1_.end());
          return boost::make_tuple(S, static_cast<int>(count0_.size()),
                                   static_cast<int>(count1_.size()));
      } // compare_

      int k_;
      bool isdna_;

      // kmer counts keyed by integer kmer code (DNA/RNA mode)
      std::map<unsigned long long int, int> dcount0_;
      std::map<unsigned long long int, int> dcount1_;

      // kmer counts keyed by kmer string (general mode)
      std::map<std::string, int> count0_;
      std::map<std::string, int> count1_;

      detail::dna_kmer_count dC_;

  }; // class d2


  /** Class: cosine
   *
   *  Functor to compute the cosine score between kmer counts of two
   *  sequences, as produced by detail::cosine_distance.
   *  The object caches the kmer counts of the most recent s0, which
   *  allows comparing one sequence against many (see unary operator()).
   *
   *  Note that the score is a double: calling through the
   *  sequence_compare<cosine> base converts it to int.
   */
  class cosine : public sequence_compare<cosine> {
  public:
      /** Constructor: cosine
       *
       *  Parameter:
       *  k - kmer length. It must be positive for any comparison to succeed;
       *      with the default of 0 every call returns the error tuple.
       *  isdna - assume that input sequences are DNA/RNA.
       */
      explicit cosine(int k = 0, bool isdna = true) : k_(k), isdna_(isdna) { }

      /** Function: operator()
       *
       *  Compute cosine score between s0 and s1.
       *
       *  Returns:
       *  3-tuple (cosine score, number of unique kmers in s0, number of unique kmers in s1),
       *  or (-1, -1, -1) if k is not positive or either sequence is shorter than k.
       */
      boost::tuple<double, int, int> operator()(const std::string& s0, const std::string& s1) {
          if (!usable_(s0) || !usable_(s1)) return boost::make_tuple(-1.0, -1, -1);

          if (isdna_) dC_(s0, k_, dcount0_);
          else detail::general_kmer_count(s0, k_, count0_);

          return compare_(s1);
      } // operator()

      /** Function: operator()
       *
       *  Compute cosine score between s0 and s1, where s0 is a sequence
       *  from the previous call of the binary version of this operator.
       *  If that call returned the error tuple, or was never made, the
       *  cached counts of s0 are whatever the object held before.
       *
       *  Returns:
       *  3-tuple (cosine score, number of unique kmers in s0, number of unique kmers in s1),
       *  or (-1, -1, -1) if k is not positive or s1 is shorter than k.
       */
      boost::tuple<double, int, int> operator()(const std::string& s1) {
          if (!usable_(s1)) return boost::make_tuple(-1.0, -1, -1);
          return compare_(s1);
      } // operator()

  private:
      // True if s has at least one kmer. The explicit k_ > 0 test matters:
      // with k_ == 0 the DNA counter would read before the start of s.
      bool usable_(const std::string& s) const {
          return (k_ > 0) && (s.size() >= static_cast<std::string::size_type>(k_));
      } // usable_

      // Count kmers of s1 and score them against the cached counts of s0.
      boost::tuple<double, int, int> compare_(const std::string& s1) {
          if (isdna_) {
              dC_(s1, k_, dcount1_);
              double S = detail::cosine_distance(dcount0_.begin(), dcount0_.end(),
                                                 dcount1_.begin(), dcount1_.end());
              return boost::make_tuple(S, static_cast<int>(dcount0_.size()),
                                       static_cast<int>(dcount1_.size()));
          }

          detail::general_kmer_count(s1, k_, count1_);
          double S = detail::cosine_distance(count0_.begin(), count0_.end(),
                                             count1_.begin(), count1_.end());
          return boost::make_tuple(S, static_cast<int>(count0_.size()),
                                   static_cast<int>(count1_.size()));
      } // compare_

      int k_;
      bool isdna_;

      // kmer counts keyed by integer kmer code (DNA/RNA mode)
      std::map<unsigned long long int, int> dcount0_;
      std::map<unsigned long long int, int> dcount1_;

      // kmer counts keyed by kmer string (general mode)
      std::map<std::string, int> count0_;
      std::map<std::string, int> count1_;

      detail::dna_kmer_count dC_;

  }; // class cosine


  /** Class: kmer_fraction
   *
   *  Functor to compute the number of shared kmers between two sequences.
   *  It can be used to compute e.g. kmer fraction similarity, defined as
   *  the Jaccard index between kmer spectra of sequences.
   *  The object caches the sorted kmer list of the most recent s0, which
   *  allows comparing one sequence against many (see unary operator()).
   */
  class kmer_fraction : public sequence_compare<kmer_fraction> {
  public:
      /** Constructor: kmer_fraction
       *
       *  Parameter:
       *  k - kmer length. It must be positive for any comparison to succeed;
       *      with the default of 0 every call returns the error tuple.
       *  isdna - assume that input sequences are DNA/RNA.
       */
      explicit kmer_fraction(int k = 0, bool isdna = true) : k_(k), isdna_(isdna) { }

      /** Function: operator()
       *
       *  Returns:
       *  3-tuple (number of common kmers, number of kmers in s0, number of kmers in s1),
       *  or (-1, -1, -1) if k is 0 or either sequence is shorter than k.
       */
      boost::tuple<int, int, int> operator()(const std::string& s0, const std::string& s1) {
          if (!usable_(s0) || !usable_(s1)) return boost::make_tuple(-1, -1, -1);

          // intersection_size walks both lists merge-style, hence
          // the kmer list must be sorted in both modes
          if (isdna_) {
              dI_(s0, k_, dindex0_);
              std::sort(dindex0_.begin(), dindex0_.end());
          } else {
              detail::general_kmer_index(s0, k_, index0_);
              std::sort(index0_.begin(), index0_.end());
          }

          return compare_(s1);
      } // operator()

      /** Function: operator()
       *
       *  Compute kmer score between s0 and s1, where s0 is a sequence
       *  from the previous call of the binary version of this operator.
       *  If that call returned the error tuple, or was never made, the
       *  cached kmer list of s0 is whatever the object held before.
       *
       *  Returns:
       *  3-tuple (number of common kmers, number of kmers in s0, number of kmers in s1),
       *  or (-1, -1, -1) if k is 0 or s1 is shorter than k.
       */
      boost::tuple<int, int, int> operator()(const std::string& s1) {
          if (!usable_(s1)) return boost::make_tuple(-1, -1, -1);
          return compare_(s1);
      } // operator()

  private:
      // True if s has at least one kmer. The explicit k_ > 0 test matters:
      // with k_ == 0 the DNA indexer would read before the start of s.
      bool usable_(const std::string& s) const {
          return (k_ > 0) && (s.size() >= k_);
      } // usable_

      // List and sort kmers of s1, then match them against the cached,
      // already sorted, kmer list of s0.
      boost::tuple<int, int, int> compare_(const std::string& s1) {
          if (isdna_) {
              dI_(s1, k_, dindex1_);
              std::sort(dindex1_.begin(), dindex1_.end());
              int S = detail::intersection_size(dindex0_.begin(), dindex0_.end(),
                                                dindex1_.begin(), dindex1_.end(),
                                                std::less<unsigned long long int>());
              return boost::make_tuple(S, static_cast<int>(dindex0_.size()),
                                       static_cast<int>(dindex1_.size()));
          }

          detail::general_kmer_index(s1, k_, index1_);
          std::sort(index1_.begin(), index1_.end());
          int S = detail::intersection_size(index0_.begin(), index0_.end(),
                                            index1_.begin(), index1_.end(),
                                            std::less<std::string>());
          return boost::make_tuple(S, static_cast<int>(index0_.size()),
                                   static_cast<int>(index1_.size()));
      } // compare_

      unsigned int k_;
      bool isdna_;

      // sorted kmer codes (DNA/RNA mode)
      std::vector<unsigned long long int> dindex0_;
      std::vector<unsigned long long int> dindex1_;

      // sorted kmer strings (general mode)
      std::vector<std::string> index0_;
      std::vector<std::string> index1_;

      detail::dna_kmer_index dI_;

  }; // class kmer_fraction


  /** Class: spaced_seeds_fraction
   *
   *  Functor to compute the number of shared spaced words between two
   *  sequences. A spaced word is a substring of the same length as the
   *  seed in which every position where the seed has '0' is replaced
   *  with '*', so that such positions are ignored when words are matched.
   */
  class spaced_seeds_fraction : public sequence_compare<spaced_seeds_fraction> {
  public:
      /** Constructor: spaced_seeds_fraction
       *
       *  Parameter:
       *  sseed - seed pattern; '0' marks a masked position, any other
       *          character marks a position that is kept.
       */
      explicit spaced_seeds_fraction(const std::string& sseed) : sseed_(sseed) { }

      /** Function: operator()
       *
       *  Returns:
       *  3-tuple (number of common spaced words, number of words in s0,
       *  number of words in s1), or (-1, -1, -1) if the seed is empty
       *  or either sequence is shorter than the seed.
       */
      boost::tuple<int, int, int> operator()(const std::string& s0, const std::string& s1) {
          const std::string::size_type k = sseed_.size();

          // same error convention as the other comparison functors
          if ((k == 0) || (s0.size() < k) || (s1.size() < k)) return boost::make_tuple(-1, -1, -1);

          build_(s0, index0_);
          build_(s1, index1_);

          int S = detail::intersection_size(index0_.begin(), index0_.end(),
                                            index1_.begin(), index1_.end(),
                                            std::less<std::string>());

          return boost::make_tuple(S, static_cast<int>(index0_.size()),
                                   static_cast<int>(index1_.size()));
      } // operator()

  private:
      // Fill index with the sorted list of masked words of s.
      void build_(const std::string& s, std::vector<std::string>& index) const {
          const std::string::size_type k = sseed_.size();

          detail::general_kmer_index(s, static_cast<int>(k), index);

          for (std::vector<std::string>::size_type i = 0; i < index.size(); ++i) {
              for (std::string::size_type j = 0; j < k; ++j) {
                  if (sseed_[j] == '0') index[i][j] = '*';
              }
          }

          // intersection_size expects sorted input
          std::sort(index.begin(), index.end());
      } // build_

      std::string sseed_;

      std::vector<std::string> index0_;
      std::vector<std::string> index1_;

  }; // spaced_seeds_fraction

} // namespace bio

#endif // SEQUENCE_COMPARE_HPP
