From 3f80058802cb75d53efbad99d818bbb575dc586c Mon Sep 17 00:00:00 2001
From: Mihai Todor <mihaitodor@optum.com>
Date: Thu, 5 Oct 2023 17:14:43 +0300
Subject: [PATCH] Test SPDI 3 bit packing RefSeq scheme

---
 .github/workflows/main.yml      |  12 ++--
 README.md                       |   6 ++
 app/__init__.py                 |   4 --
 app/api_spec.yml                |  55 +++++++++++---
 app/refseq.py                   |  28 --------
 app/utilities_endpoints.py      |  12 +++-
 fetch_refseq.sh                 |   8 +--
 utilities/SPDI_Normalization.py | 124 ++++++++++++++++----------------
 8 files changed, 137 insertions(+), 112 deletions(-)
 delete mode 100644 app/refseq.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 318b9baf..580e6cfd 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -28,20 +28,24 @@ jobs:
           args: --extend-ignore E501,E741
 
       - name: Run Tests
-        run: python -m pytest
+        # run: ./fetch_refseq.sh python -m pytest
+        run: "echo ${heroku_app_name} && false"
+        with:
+          heroku_app_name: ${{secrets.HEROKU_DEV_APP_NAME}}
 
+  # TODO: Add a way to deploy to Prod manually
   deploy:
     name: Deploy
     runs-on: ubuntu-latest
 
-    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && always() && contains(join(needs.*.result, ','), 'success') }}
+    if: ${{ contains(join(needs.*.result, ','), 'success') }}
     needs: [test]
 
     steps:
       - uses: actions/checkout@v2
 
-      - uses: akhileshns/heroku-deploy@v3.12.12
+      - uses: akhileshns/heroku-deploy@v3.12.14
         with:
           heroku_api_key: ${{secrets.HEROKU_API_KEY}}
-          heroku_app_name: ${{secrets.HEROKU_APP_NAME}}
+          heroku_app_name: ${{secrets.HEROKU_DEV_APP_NAME}}
           heroku_email: ${{secrets.HEROKU_EMAIL}}
diff --git a/README.md b/README.md
index 0ef80cf0..f4785b1e 100644
--- a/README.md
+++ b/README.md
@@ -48,3 +48,9 @@ run `python3 -m pytest` from the terminal to execute them all.
 Additionally, since the tests run against the Mongo DB database, if you need to update the test data in this repo, you
 can run `OVERWRITE_TEST_EXPECTED_DATA=true python3 -m pytest` from the terminal and then create a pull request with the
 changes.
+
+## Development environment on Heroku
+
+Pull requests will trigger a deployment to this environment automatically which is accessible at the following URL:
+
+https://fhir-gen-ops-dev-ca42373833b6.herokuapp.com/
diff --git a/app/__init__.py b/app/__init__.py
index ea431410..d806ce99 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -2,13 +2,9 @@
 import flask
 from flask_cors import CORS
 import os
-# from .refseq import download_refseq_files
 
 
 def create_app():
-    # First ensure we have the refseq files locally
-    # download_refseq_files()
-
     # App and API
     options = {
         'swagger_url': '/',
diff --git a/app/api_spec.yml b/app/api_spec.yml
index 73ba7c99..23403ce7 100644
--- a/app/api_spec.yml
+++ b/app/api_spec.yml
@@ -102,7 +102,7 @@ paths:
             type: boolean
             default: false
           description: Include sequence phase relationships in response if set to true.
-  
+
   /subject-operations/genotype-operations/$find-subject-specific-variants:
     get:
       description: >
@@ -177,7 +177,7 @@ paths:
               - "germline"
               - "somatic"
           description: Enables an App to limit results to those that are 'germline' or 'somatic'. Default is to include variants irrespective of genomic source class.
-  
+
   /subject-operations/genotype-operations/$find-subject-structural-intersecting-variants:
     get:
       description: >
@@ -262,7 +262,7 @@ paths:
             type: boolean
             default: false
           description: Include variants in response if set to true.
-  
+
   /subject-operations/genotype-operations/$find-subject-structural-subsuming-variants:
     get:
       description: >
@@ -346,7 +346,7 @@ paths:
             type: boolean
             default: false
           description: Include variants in response if set to true.
-  
+
   /subject-operations/genotype-operations/$find-subject-haplotypes:
     get:
       description: >
@@ -422,7 +422,7 @@ paths:
               - "germline"
               - "somatic"
           description: Enables an App to limit results to those that are 'germline' or 'somatic'. Default is to include variants irrespective of genomic source class.
-  
+
   /subject-operations/genotype-operations/$find-subject-specific-haplotypes:
     get:
       description: >
@@ -497,7 +497,7 @@ paths:
               - "germline"
               - "somatic"
           description: Enables an App to limit results to those that are 'germline' or 'somatic'. Default is to include variants irrespective of genomic source class.
-  
+
   /subject-operations/phenotype-operations/$find-subject-tx-implications:
     get:
       description: |-
@@ -614,7 +614,7 @@ paths:
               - "germline"
               - "somatic"
           description: Enables an App to limit results to those that are 'germline' or 'somatic'. Default is to include variants irrespective of genomic source class.
-  
+
   /subject-operations/phenotype-operations/$find-subject-dx-implications:
     get:
       description: |-
@@ -713,7 +713,7 @@ paths:
               - "germline"
               - "somatic"
           description: Enables an App to limit results to those that are 'germline' or 'somatic'. Default is to include variants irrespective of genomic source class.
-  
+
   /subject-operations/metadata-operations/$find-study-metadata:
     get:
       description: |-
@@ -1147,6 +1147,7 @@ paths:
             type: string
             pattern: '^\s*[Nn][Pp]_\d{4,10}(\.)?(\d{1,2})?\s*$'
           example: "NP_000005.3"
+
   /utilities/find-the-gene:
     get:
       summary: "Find The Gene"
@@ -1170,6 +1171,42 @@ paths:
             pattern: '^\s*[Nn][Cc]_\d{4,10}(\.)(\d{1,2}):\d{1,10}-\d{1,10}\s*$'
             example: "NC_000001.11:11794399-11794400"
 
+  /utilities/seqfetcher/1/sequence/{ref_seq}:
+    get:
+      summary: "Seqfetcher"
+      operationId: "app.utilities_endpoints.seqfetcher"
+      tags:
+        - "Seqfetcher Utility"
+      responses:
+        '200':
+          description: "Returns RefSeq subsequence"
+          content:
+            text/plain:
+              schema:
+                type: string
+      parameters:
+        - name: ref_seq
+          in: path
+          required: true
+          description: RefSeq
+          schema:
+            type: string
+            example: "NC_000001.10"
+        - name: start
+          in: query
+          required: true
+          description: Subsequence start index
+          schema:
+            type: integer
+            example: 1
+        - name: end
+          in: query
+          required: true
+          description: Subsequence end index
+          schema:
+            type: integer
+            example: 2
+
 tags:
   - name: Subject Genotype Operations
   - name: Subject Phenotype Operations
@@ -1178,7 +1215,7 @@ tags:
   - name: Population Phenotype Operations
   - name: Feature Coordinates Utility
     description: This utility returns genomic feature coordinates and other annotations. All data are from <a href="https://www.ncbi.nlm.nih.gov/genome/guide/human/">NCBI Human Genome Resources</a>. For chromosomes, build 37 and build 38 reference sequences are returned. For genes, genomic coordinates are returned, along with a list of transcripts. MANE transcript is flagged. For transcripts, genomic coordinates are returned, along with the gene name and composite exons, along with exon coordinates. For proteins, the corresponding transcript is returned.
-  
+
   - name: Find The Gene Utility
     description: This utility returns all genes that intersect with a provided genomic region.
 
diff --git a/app/refseq.py b/app/refseq.py
deleted file mode 100644
index 50dad198..00000000
--- a/app/refseq.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from pathlib import Path
-from urllib.request import urlretrieve
-import tarfile
-import sys
-
-SEQ_FILES_RELEASE_TAG = '113c119'
-
-
-def download_refseq_files():
-    try:
-        # Make sure the parent folder exists
-        Path('refseq').mkdir(exist_ok=True)
-
-        for build in ['GRCh37', 'GRCh38']:
-            filename = build + 'seq.tar.gz'
-            filepath = f'refseq/{filename}'
-
-            if not Path(filepath).is_file():
-                # Download seq files
-                urlretrieve('https://github.com/FHIR/genomics-operations/releases/download/' + SEQ_FILES_RELEASE_TAG + '/' +
-                            filename, filepath)
-
-            # Unarchive seq files each time on startup, just in case something went wrong the previous time
-            with tarfile.open(filepath, "r|gz") as f:
-                f.extractall(path='refseq')
-
-    except Exception as err:
-        sys.exit(f"Failed to download refseq files: {err=}, {type(err)=}")
diff --git a/app/utilities_endpoints.py b/app/utilities_endpoints.py
index c2f278ff..03fdc085 100644
--- a/app/utilities_endpoints.py
+++ b/app/utilities_endpoints.py
@@ -1,6 +1,7 @@
 from flask import abort, jsonify
 from collections import OrderedDict
 from app import common
+from utilities import SPDI_Normalization
 
 
 def get_feature_coordinates(
@@ -133,7 +134,8 @@ def get_feature_coordinates(
             protein = protein.split('.')[0]
 
         try:
-            result = common.proteins_data.aggregate([{"$match": {"proteinRefSeq": {'$regex': ".*"+str(protein).replace('*', r'\*')+".*"}}}])
+            result = common.proteins_data.aggregate(
+                [{"$match": {"proteinRefSeq": {'$regex': ".*"+str(protein).replace('*', r'\*')+".*"}}}])
             result = list(result)
         except Exception as e:
             print(f"DEBUG: Error({e}) under get_feature_coordinates(protein={protein})")
@@ -189,3 +191,11 @@ def find_the_gene(range=None):
         output.append(ord_dict)
 
     return (jsonify(output))
+
+
+def seqfetcher(ref_seq, start, end):
+    try:
+        subseq = SPDI_Normalization.get_ref_seq_subseq('GRCh37', ref_seq, start, end)
+    except Exception:
+        subseq = SPDI_Normalization.get_ref_seq_subseq('GRCh38', ref_seq, start, end)
+    return f'>{ref_seq}:{start}-{end} Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly\n{subseq}\n\n'
diff --git a/fetch_refseq.sh b/fetch_refseq.sh
index 9fc41c78..f409c6c4 100755
--- a/fetch_refseq.sh
+++ b/fetch_refseq.sh
@@ -10,12 +10,12 @@ cd ./refseq
 
 echo "Downloading refseq files..."
 
-curl -sLO https://github.com/FHIR/genomics-operations/releases/download/113c119/GRCh37seq.tar.gz
-curl -sLO https://github.com/FHIR/genomics-operations/releases/download/113c119/GRCh38seq.tar.gz
+curl -sLO https://github.com/FHIR/genomics-operations/releases/download/113c119/GRCh37_refseq.tar.gz
+curl -sLO https://github.com/FHIR/genomics-operations/releases/download/113c119/GRCh38_refseq.tar.gz
 
 echo "Extracting refseq files..."
 
-tar -xzf GRCh37seq.tar.gz
-tar -xzf GRCh38seq.tar.gz
+tar -xzf GRCh37_refseq.tar.gz
+tar -xzf GRCh38_refseq.tar.gz
 
 echo "Finished extracting refseq files."
diff --git a/utilities/SPDI_Normalization.py b/utilities/SPDI_Normalization.py
index 3969270a..dc2dd890 100644
--- a/utilities/SPDI_Normalization.py
+++ b/utilities/SPDI_Normalization.py
@@ -1,99 +1,99 @@
 from glob import glob
-from math import ceil
+from math import floor
 from pathlib import Path
 from threading import Lock
-from memory_profiler import profile
 
-MAX_REFSEQ_CACHE_SIZE = 200 * 1024 * 1024  # 450MB
 
-
-def hex_to_code(code):
+def hex_to_code(hex):
     """
-    Convert a 4 byte integer to a sequence code. See `code2Hex()` for details
+    Convert a 3 bit integer to a sequence code. See `code_to_hex()` in pack_refseq.py for details.
     """
-    match code:
+    match hex:
         case 0x0: return 'A'
         case 0x1: return 'C'
         case 0x2: return 'G'
         case 0x3: return 'T'
-        case 0x4: return 'U'
-        case 0x5: return 'R'
-        case 0x6: return 'Y'
-        case 0x7: return 'K'
-        case 0x8: return 'M'
-        case 0x9: return 'S'
-        case 0xA: return 'W'
-        case 0xB: return 'B'
-        case 0xC: return 'D'
-        case 0xD: return 'H'
-        case 0xE: return 'V'
-        case 0xF: return 'N'
+        case 0x4: return 'N'
+        # Blow up if we get any unexpected hex-encoded code
         case _:
-            raise NotImplementedError(f"unsupported code '{code}'")
-
-
-def refseq_length_to_size(length):
-    # Each byte contains at most 2 codes
-    return ceil(length / 2)
+            raise NotImplementedError(f"unexpected hex-encoded code: '{hex}'")
 
 
 class RefSeq:
-    def __init__(self, refSeq, length):
-        """
-        Store the length so we can easily tell when the index operator receives an out of bounds subscript because we
-        want to represent each symbol using 4 bits and we have 16 symbols, so there's no room left to represent an
-        invalid symbol which denotes end-of-sequence.
-        """
-        self.__packedRefSeq = refSeq
-        self.__length = length
-
-        # Size in bytes
-        self.size = refseq_length_to_size(length)
+    # TODO: Consider adding `__str__()` method (and maybe `__repr__()` too?)
+
+    def __init__(self, packed_ref_seq, len):
+        self.__packed_ref_seq = packed_ref_seq
+        # Store the RefSeq length so we can easily tell when the index operator receives an out of bounds subscript
+        # because we want to represent each code using 3 bits and we don't want to introduce an end-of-sequence code, in
+        # case we'll want to leverage the unused codes for something else.
+        self.__len = len
 
     def __getitem__(self, subscript):
         if isinstance(subscript, slice):
             codes = []
-            for i in range(*subscript.indices(self.__length)):
+            # TODO: It should be more efficient to process 3 bytes at a time for long ranges.
+            for i in range(*subscript.indices(self.__len)):
                 codes.append(self[i])
             return "".join(codes)
 
-        byte = self.__packedRefSeq[subscript // 2]
+        if subscript >= self.__len:
+            raise IndexError(f"list index out of range: '{subscript}' must be less than '{self.__len}'")
+
+        # Select the byte(s) and the offset of the code we wish to extract from the packed RefSeq data.
+        packed_subscript = floor(subscript * 3 / 8)
+        data = self.__packed_ref_seq[packed_subscript]
+        match subscript % 8:
+            case 0: offset = 0
+            case 1: offset = 3
+            case 2:
+                offset = 6
+                # This code is packed across two bytes.
+                data += self.__packed_ref_seq[packed_subscript + 1] << 8
+            case 3: offset = 1
+            case 4: offset = 4
+            case 5:
+                offset = 7
+                # This code is packed across two bytes.
+                data += self.__packed_ref_seq[packed_subscript + 1] << 8
+            case 6: offset = 2
+            case 7: offset = 5
+
+        # Move the relevant three bits to the bottom, extract them using a bit mask and convert them back to a code.
+        return hex_to_code((data >> offset) & 0x3)
 
-        if subscript % 2 == 0:
-            return hex_to_code((byte >> 4) & 0xF)
 
-        return hex_to_code(byte & 0xF)
+def get_ref_seq(build, ref_seq_name):
+    try:
+        ref_seq_file_pattern = f'refseq/{build}/{ref_seq_name}_*.refseq'
+        found_files = glob(ref_seq_file_pattern)
+        if len(found_files) != 1:
+            raise ValueError(f'failed to find expected refseq file for build "{build}" and RefSeq "{ref_seq_name}" ')
 
+        ref_seq_file = found_files[0]
+        ref_seq_length = int(Path(ref_seq_file).stem.rpartition('_')[-1])
 
-refSeqLock = Lock()
+        with open(ref_seq_file, mode='rb') as file:
+            refSeq = RefSeq(file.read(), ref_seq_length)
+    except Exception as err:
+        print(f"failed to read refseq file: {err=}, {type(err)=}")
+        raise
 
+    return refSeq
 
-def get_ref_seq(build, refSeqName):
-    if not refSeqCache.contains(build, refSeqName):
-        try:
-            refSeqFilePattern = f'refseq/{build}/{refSeqName}_*.seq'
-            foundFiles = glob(refSeqFilePattern)
-            if len(foundFiles) != 1:
-                raise ValueError(f'failed to find expected refseq file for build "{build}" and RefSeq "{refSeqName}" ')
 
-            refSeqFile = foundFiles[0]
-            refSeqLength = int(Path(refSeqFile).stem.rpartition('_')[-1])
+ref_seq_lock = Lock()
 
-            # Make room for this refseq in the cache
-            refSeqCache.reserve_space(refSeqLength)
 
-            with open(refSeqFile, mode='rb') as file:
-                refSeq = RefSeq(file.read(), refSeqLength)
-        except Exception as err:
-            print(f"failed to read refseq file: {err=}, {type(err)=}")
-            raise
-        refSeqCache.add(build, refSeqName, refSeq)
-    return refSeqCache.get(build, refSeqName)
+def get_ref_seq_subseq(build, ref_seq, start, end):
+    # Need to serialise this if we can't keep all the RefSeq data in memory
+    with ref_seq_lock:
+        return get_ref_seq(build, ref_seq)[start:end]
 
 
 def get_normalized_spdi(ref_seq, pos, ref, alt, build):
-    # Need to serialise this if we can't keep all the ref seq data in memory
-    with refSeqLock:
+    # Need to serialise this if we can't keep all the RefSeq data in memory
+    with ref_seq_lock:
         return get_normalized_spdi_impl(ref_seq, pos, ref, alt, build)