Spaces:
Sleeping
Sleeping
kabudadada
committed on
Commit
·
a1f2eee
1
Parent(s):
034d7df
Add essential alphagenome source files only
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- alphagenome/source +0 -1
- alphagenome/source/.gitattributes +1 -0
- alphagenome/source/.github/ISSUE_TEMPLATE/bug_report.yml +48 -0
- alphagenome/source/.github/ISSUE_TEMPLATE/config.yml +5 -0
- alphagenome/source/.github/workflows/presubmit_checks.yml +37 -0
- alphagenome/source/.github/workflows/release.yaml +45 -0
- alphagenome/source/.pylintrc +458 -0
- alphagenome/source/.readthedocs.yaml +42 -0
- alphagenome/source/CHANGELOG.md +42 -0
- alphagenome/source/CONTRIBUTING.md +25 -0
- alphagenome/source/LICENSE +202 -0
- alphagenome/source/README.md +235 -0
- alphagenome/source/__init__.py +4 -0
- alphagenome/source/colabs/batch_variant_scoring.ipynb +0 -0
- alphagenome/source/colabs/essential_commands.ipynb +1405 -0
- alphagenome/source/colabs/example_analysis_workflow.ipynb +0 -0
- alphagenome/source/colabs/quick_start.ipynb +0 -0
- alphagenome/source/colabs/tissue_ontology_mapping.ipynb +0 -0
- alphagenome/source/colabs/visualization_modality_tour.ipynb +0 -0
- alphagenome/source/conftest.py +22 -0
- alphagenome/source/docs/Makefile +20 -0
- alphagenome/source/docs/README.md +6 -0
- alphagenome/source/docs/make.bat +49 -0
- alphagenome/source/docs/source/_templates/autosummary/class.rst +55 -0
- alphagenome/source/docs/source/api/data.md +89 -0
- alphagenome/source/docs/source/api/index.md +46 -0
- alphagenome/source/docs/source/api/interpretation.md +16 -0
- alphagenome/source/docs/source/api/models.md +55 -0
- alphagenome/source/docs/source/api/visualization.md +60 -0
- alphagenome/source/docs/source/conf.py +206 -0
- alphagenome/source/docs/source/exploring_model_metadata.md +93 -0
- alphagenome/source/docs/source/faqs.md +378 -0
- alphagenome/source/docs/source/index.md +82 -0
- alphagenome/source/docs/source/installation.md +73 -0
- alphagenome/source/docs/source/references.md +7 -0
- alphagenome/source/docs/source/refs.bib +50 -0
- alphagenome/source/docs/source/tutorials/index.md +55 -0
- alphagenome/source/docs/source/user_guides/index.md +46 -0
- alphagenome/source/docs/source/variant_scoring.md +257 -0
- alphagenome/source/docs/source/visualization_library_basics.md +153 -0
- alphagenome/source/hatch_build.py +49 -0
- alphagenome/source/pyproject.toml +131 -0
- alphagenome/source/scripts/process_gtf.py +42 -0
- alphagenome/source/src/__init__.py +4 -0
- alphagenome/source/src/alphagenome/__init__.py +18 -0
- alphagenome/source/src/alphagenome/colab_utils.py +62 -0
- alphagenome/source/src/alphagenome/colab_utils_test.py +68 -0
- alphagenome/source/src/alphagenome/data/__init__.py +15 -0
- alphagenome/source/src/alphagenome/data/fold_intervals.py +115 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.ipynb filter=lfs diff=lfs merge=lfs -text
|
alphagenome/source
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
Subproject commit b7d3963ce241c2390ea18bb99fa0722e1c169952
|
|
|
|
|
|
alphagenome/source/.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.ipynb linguist-documentation
|
alphagenome/source/.github/ISSUE_TEMPLATE/bug_report.yml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: Bug report
|
| 3 |
+
description: >-
|
| 4 |
+
Report a bug or unexpected behavior to help us improve AlphaGenome.
|
| 5 |
+
labels:
|
| 6 |
+
- bug
|
| 7 |
+
|
| 8 |
+
body:
|
| 9 |
+
- type: markdown
|
| 10 |
+
attributes:
|
| 11 |
+
value: >
|
| 12 |
+
## Thank you for helping us improve AlphaGenome!
|
| 13 |
+
|
| 14 |
+
* Please verify that your issue has not been reported using
|
| 15 |
+
[Issue search][issue search].
|
| 16 |
+
|
| 17 |
+
* If you have a question about usage, please
|
| 18 |
+
consider [starting a discussion][Discussions].
|
| 19 |
+
|
| 20 |
+
* If you prefer a non-templated issue report, click [here][Raw report].
|
| 21 |
+
|
| 22 |
+
[Discussions]: https://www.alphagenomecommunity.com/
|
| 23 |
+
|
| 24 |
+
[issue search]: https://github.com/google-deepmind/alphagenome/search?q=is%3Aissue&type=issues
|
| 25 |
+
|
| 26 |
+
[Raw report]: https://github.com/google-deepmind/alphagenome/issues/new?template=none
|
| 27 |
+
- type: textarea
|
| 28 |
+
attributes:
|
| 29 |
+
label: Description
|
| 30 |
+
description: A concise description of the bug.
|
| 31 |
+
placeholder: |
|
| 32 |
+
Text may use markdown formatting.
|
| 33 |
+
```python
|
| 34 |
+
# for codeblocks, use triple backticks
|
| 35 |
+
```
|
| 36 |
+
validations:
|
| 37 |
+
required: true
|
| 38 |
+
- type: textarea
|
| 39 |
+
attributes:
|
| 40 |
+
label: System info (python version, alphagenome version, etc.)
|
| 41 |
+
description: >-
|
| 42 |
+
Include the output of `import alphagenome; alphagenome.__version__`
|
| 43 |
+
placeholder: |
|
| 44 |
+
```
|
| 45 |
+
...
|
| 46 |
+
```
|
| 47 |
+
validations:
|
| 48 |
+
required: true
|
alphagenome/source/.github/ISSUE_TEMPLATE/config.yml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
blank_issues_enabled: false
|
| 2 |
+
contact_links:
|
| 3 |
+
- name: Have questions or need support?
|
| 4 |
+
url: https://www.alphagenomecommunity.com/
|
| 5 |
+
about: Please ask questions on our community forums.
|
alphagenome/source/.github/workflows/presubmit_checks.yml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# https://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
name: CI
|
| 16 |
+
on: [push, pull_request]
|
| 17 |
+
jobs:
|
| 18 |
+
test:
|
| 19 |
+
runs-on: ${{ matrix.os }}
|
| 20 |
+
strategy:
|
| 21 |
+
fail-fast: false
|
| 22 |
+
matrix:
|
| 23 |
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
| 24 |
+
os: [ubuntu-latest]
|
| 25 |
+
steps:
|
| 26 |
+
- uses: actions/checkout@v4
|
| 27 |
+
- name: Set up Python ${{ matrix.python-version }}
|
| 28 |
+
uses: actions/setup-python@v5
|
| 29 |
+
with:
|
| 30 |
+
python-version: ${{ matrix.python-version }}
|
| 31 |
+
- name: Install alphagenome with dependencies
|
| 32 |
+
run: |
|
| 33 |
+
python -m pip install -U pip hatch
|
| 34 |
+
- name: Check
|
| 35 |
+
run: python -m hatch run check:all
|
| 36 |
+
- name: Unit tests
|
| 37 |
+
run: python -m hatch test
|
alphagenome/source/.github/workflows/release.yaml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# https://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
name: Release
|
| 16 |
+
|
| 17 |
+
on:
|
| 18 |
+
release:
|
| 19 |
+
types: [published]
|
| 20 |
+
|
| 21 |
+
# Use "trusted publishing", see https://docs.pypi.org/trusted-publishers/
|
| 22 |
+
jobs:
|
| 23 |
+
release:
|
| 24 |
+
name: Upload release to PyPI
|
| 25 |
+
runs-on: ubuntu-latest
|
| 26 |
+
environment:
|
| 27 |
+
name: pypi
|
| 28 |
+
url: https://pypi.org/p/alphagenome
|
| 29 |
+
permissions:
|
| 30 |
+
id-token: write
|
| 31 |
+
steps:
|
| 32 |
+
- uses: actions/checkout@v4
|
| 33 |
+
with:
|
| 34 |
+
filter: blob:none
|
| 35 |
+
fetch-depth: 0
|
| 36 |
+
- name: Set up Python 3.12
|
| 37 |
+
uses: actions/setup-python@v5
|
| 38 |
+
with:
|
| 39 |
+
python-version: 3.12
|
| 40 |
+
- name: Install hatch
|
| 41 |
+
run: python -m pip install -U pip hatch
|
| 42 |
+
- name: Build package
|
| 43 |
+
run: python -m hatch build
|
| 44 |
+
- name: Publish package distributions to PyPI
|
| 45 |
+
uses: pypa/gh-action-pypi-publish@release/v1
|
alphagenome/source/.pylintrc
ADDED
|
@@ -0,0 +1,458 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This Pylint rcfile contains a best-effort configuration to uphold the
|
| 2 |
+
# best-practices and style described in the Google Python style guide:
|
| 3 |
+
# https://google.github.io/styleguide/pyguide.html
|
| 4 |
+
#
|
| 5 |
+
# Its original canonical open-source location is:
|
| 6 |
+
# https://google.github.io/styleguide/pylintrc
|
| 7 |
+
#
|
| 8 |
+
# Also includes some modifications specific to this repository.
|
| 9 |
+
|
| 10 |
+
[MASTER]
|
| 11 |
+
|
| 12 |
+
# Add files or directories to the ignore list. They should be base names, not
|
| 13 |
+
# paths.
|
| 14 |
+
ignore=third_party,
|
| 15 |
+
./src/alphagenome/protos
|
| 16 |
+
|
| 17 |
+
# Add files or directories matching the regex patterns to the ignore list. The
|
| 18 |
+
# regex matches against base names, not paths.
|
| 19 |
+
ignore-patterns=
|
| 20 |
+
|
| 21 |
+
# Pickle collected data for later comparisons.
|
| 22 |
+
persistent=no
|
| 23 |
+
|
| 24 |
+
# List of plugins (as comma separated values of python modules names) to load,
|
| 25 |
+
# usually to register additional checkers.
|
| 26 |
+
load-plugins=
|
| 27 |
+
|
| 28 |
+
# Use multiple processes to speed up Pylint.
|
| 29 |
+
jobs=4
|
| 30 |
+
|
| 31 |
+
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
| 32 |
+
# active Python interpreter and may run arbitrary code.
|
| 33 |
+
unsafe-load-any-extension=no
|
| 34 |
+
|
| 35 |
+
# A comma-separated list of package or module names from where C extensions may
|
| 36 |
+
# be loaded. Extensions are loaded into the active Python interpreter and may
|
| 37 |
+
# run arbitrary code.
|
| 38 |
+
extension-pkg-allow-list=
|
| 39 |
+
|
| 40 |
+
# Minimum Python version to use for version dependent checks.
|
| 41 |
+
py-version=3.10
|
| 42 |
+
|
| 43 |
+
[MESSAGES CONTROL]
|
| 44 |
+
|
| 45 |
+
# Only show warnings with the listed confidence levels. Leave empty to show
|
| 46 |
+
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
|
| 47 |
+
confidence=
|
| 48 |
+
|
| 49 |
+
# Enable the message, report, category or checker with the given id(s). You can
|
| 50 |
+
# either give multiple identifiers separated by comma (,) or put this option
|
| 51 |
+
# multiple times (only on the command line, not in the configuration file where
|
| 52 |
+
# it should appear only once). See also the "--disable" option for examples.
|
| 53 |
+
#enable=
|
| 54 |
+
|
| 55 |
+
# Disable the message, report, category or checker with the given id(s). You
|
| 56 |
+
# can either give multiple identifiers separated by comma (,) or put this
|
| 57 |
+
# option multiple times (only on the command line, not in the configuration
|
| 58 |
+
# file where it should appear only once). You can also use "--disable=all" to
|
| 59 |
+
# disable everything first and then reenable specific checks. For example, if
|
| 60 |
+
# you want to run only the similarities checker, you can use "--disable=all
|
| 61 |
+
# --enable=similarities". If you want to run only the classes checker, but have
|
| 62 |
+
# no Warning level messages displayed, use "--disable=all --enable=classes
|
| 63 |
+
# --disable=W"
|
| 64 |
+
disable=abstract-method,
|
| 65 |
+
apply-builtin,
|
| 66 |
+
arguments-differ,
|
| 67 |
+
attribute-defined-outside-init,
|
| 68 |
+
backtick,
|
| 69 |
+
bad-option-value,
|
| 70 |
+
basestring-builtin,
|
| 71 |
+
buffer-builtin,
|
| 72 |
+
c-extension-no-member,
|
| 73 |
+
chained-comparison,
|
| 74 |
+
cmp-builtin,
|
| 75 |
+
cmp-method,
|
| 76 |
+
coerce-builtin,
|
| 77 |
+
coerce-method,
|
| 78 |
+
consider-iterating-dictionary,
|
| 79 |
+
consider-using-enumerate,
|
| 80 |
+
consider-using-in,
|
| 81 |
+
delslice-method,
|
| 82 |
+
div-method,
|
| 83 |
+
duplicate-code,
|
| 84 |
+
eq-without-hash,
|
| 85 |
+
execfile-builtin,
|
| 86 |
+
file-builtin,
|
| 87 |
+
filter-builtin-not-iterating,
|
| 88 |
+
fixme,
|
| 89 |
+
getslice-method,
|
| 90 |
+
global-statement,
|
| 91 |
+
hex-method,
|
| 92 |
+
idiv-method,
|
| 93 |
+
implicit-str-concat-in-sequence,
|
| 94 |
+
import-error,
|
| 95 |
+
import-self,
|
| 96 |
+
import-star-module-level,
|
| 97 |
+
inconsistent-return-statements,
|
| 98 |
+
input-builtin,
|
| 99 |
+
intern-builtin,
|
| 100 |
+
invalid-field-call,
|
| 101 |
+
invalid-str-codec,
|
| 102 |
+
locally-disabled,
|
| 103 |
+
long-builtin,
|
| 104 |
+
long-suffix,
|
| 105 |
+
map-builtin-not-iterating,
|
| 106 |
+
metaclass-assignment,
|
| 107 |
+
misplaced-comparison-constant,
|
| 108 |
+
missing-function-docstring,
|
| 109 |
+
missing-module-docstring,
|
| 110 |
+
next-method-called,
|
| 111 |
+
next-method-defined,
|
| 112 |
+
no-absolute-import,
|
| 113 |
+
no-else-break,
|
| 114 |
+
no-else-continue,
|
| 115 |
+
no-else-raise,
|
| 116 |
+
no-else-return,
|
| 117 |
+
no-init, # added
|
| 118 |
+
no-member,
|
| 119 |
+
no-name-in-module,
|
| 120 |
+
no-self-use,
|
| 121 |
+
nonzero-method,
|
| 122 |
+
not-an-iterable, # false positives around dataclasses
|
| 123 |
+
not-callable, # false positives for jax.jit
|
| 124 |
+
oct-method,
|
| 125 |
+
old-division,
|
| 126 |
+
old-ne-operator,
|
| 127 |
+
old-octal-literal,
|
| 128 |
+
old-raise-syntax,
|
| 129 |
+
parameter-unpacking,
|
| 130 |
+
print-statement,
|
| 131 |
+
raising-string,
|
| 132 |
+
range-builtin-not-iterating,
|
| 133 |
+
raw_input-builtin,
|
| 134 |
+
rdiv-method,
|
| 135 |
+
reduce-builtin,
|
| 136 |
+
relative-import,
|
| 137 |
+
reload-builtin,
|
| 138 |
+
round-builtin,
|
| 139 |
+
setslice-method,
|
| 140 |
+
signature-differs,
|
| 141 |
+
standarderror-builtin,
|
| 142 |
+
suppressed-message,
|
| 143 |
+
sys-max-int,
|
| 144 |
+
too-few-public-methods,
|
| 145 |
+
too-many-ancestors,
|
| 146 |
+
too-many-arguments,
|
| 147 |
+
too-many-boolean-expressions,
|
| 148 |
+
too-many-branches,
|
| 149 |
+
too-many-instance-attributes,
|
| 150 |
+
too-many-locals,
|
| 151 |
+
too-many-nested-blocks,
|
| 152 |
+
too-many-positional-arguments,
|
| 153 |
+
too-many-public-methods,
|
| 154 |
+
too-many-return-statements,
|
| 155 |
+
too-many-statements,
|
| 156 |
+
trailing-newlines,
|
| 157 |
+
unichr-builtin,
|
| 158 |
+
unicode-builtin,
|
| 159 |
+
unnecessary-comprehension,
|
| 160 |
+
unnecessary-lambda-assignment,
|
| 161 |
+
unnecessary-pass,
|
| 162 |
+
unpacking-in-except,
|
| 163 |
+
use-dict-literal,
|
| 164 |
+
useless-else-on-loop,
|
| 165 |
+
useless-object-inheritance,
|
| 166 |
+
useless-suppression,
|
| 167 |
+
using-cmp-argument,
|
| 168 |
+
wrong-import-order,
|
| 169 |
+
xrange-builtin,
|
| 170 |
+
zip-builtin-not-iterating,
|
| 171 |
+
|
| 172 |
+
[REPORTS]
|
| 173 |
+
|
| 174 |
+
# Set the output format. Available formats are text, parseable, colorized, msvs
|
| 175 |
+
# (visual studio) and html. You can also give a reporter class, eg
|
| 176 |
+
# mypackage.mymodule.MyReporterClass.
|
| 177 |
+
output-format=text
|
| 178 |
+
|
| 179 |
+
# Tells whether to display a full report or only the messages
|
| 180 |
+
reports=no
|
| 181 |
+
|
| 182 |
+
# Python expression which should return a note less than 10 (10 is the highest
|
| 183 |
+
# note). You have access to the variables error, warning, statement which
|
| 184 |
+
# respectively contain the number of errors / warnings messages and the total
|
| 185 |
+
# number of statements analyzed. This is used by the global evaluation report
|
| 186 |
+
# (RP0004).
|
| 187 |
+
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
|
| 188 |
+
|
| 189 |
+
# Template used to display messages. This is a python new-style format string
|
| 190 |
+
# used to format the message information. See doc for all details
|
| 191 |
+
#msg-template=
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
[BASIC]
|
| 195 |
+
|
| 196 |
+
# Good variable names which should always be accepted, separated by a comma
|
| 197 |
+
good-names=main,_
|
| 198 |
+
|
| 199 |
+
# Bad variable names which should always be refused, separated by a comma
|
| 200 |
+
bad-names=
|
| 201 |
+
|
| 202 |
+
# Colon-delimited sets of names that determine each other's naming style when
|
| 203 |
+
# the name regexes allow several styles.
|
| 204 |
+
name-group=
|
| 205 |
+
|
| 206 |
+
# Include a hint for the correct naming format with invalid-name
|
| 207 |
+
include-naming-hint=no
|
| 208 |
+
|
| 209 |
+
# List of decorators that produce properties, such as abc.abstractproperty. Add
|
| 210 |
+
# to this list to register other decorators that produce valid properties.
|
| 211 |
+
property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
|
| 212 |
+
|
| 213 |
+
# Regular expression matching correct function names
|
| 214 |
+
function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
|
| 215 |
+
|
| 216 |
+
# Regular expression matching correct variable names
|
| 217 |
+
variable-rgx=^[a-z][a-z0-9_]*$
|
| 218 |
+
|
| 219 |
+
# Regular expression matching correct constant names
|
| 220 |
+
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
|
| 221 |
+
|
| 222 |
+
# Regular expression matching correct attribute names
|
| 223 |
+
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
|
| 224 |
+
|
| 225 |
+
# Regular expression matching correct argument names
|
| 226 |
+
argument-rgx=^[a-z][a-z0-9_]*$
|
| 227 |
+
|
| 228 |
+
# Regular expression matching correct class attribute names
|
| 229 |
+
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
|
| 230 |
+
|
| 231 |
+
# Regular expression matching correct inline iteration names
|
| 232 |
+
inlinevar-rgx=^[a-z][a-z0-9_]*$
|
| 233 |
+
|
| 234 |
+
# Regular expression matching correct class names
|
| 235 |
+
class-rgx=^_?[A-Z][a-zA-Z0-9]*$
|
| 236 |
+
|
| 237 |
+
# Regular expression matching correct module names
|
| 238 |
+
module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
|
| 239 |
+
|
| 240 |
+
# Regular expression matching correct method names
|
| 241 |
+
method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
|
| 242 |
+
|
| 243 |
+
# Regular expression which should only match function or class names that do
|
| 244 |
+
# not require a docstring.
|
| 245 |
+
no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
|
| 246 |
+
|
| 247 |
+
# Minimum line length for functions/classes that require docstrings, shorter
|
| 248 |
+
# ones are exempt.
|
| 249 |
+
docstring-min-length=10
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
[TYPECHECK]
|
| 253 |
+
|
| 254 |
+
# List of decorators that produce context managers, such as
|
| 255 |
+
# contextlib.contextmanager. Add to this list to register other decorators that
|
| 256 |
+
# produce valid context managers.
|
| 257 |
+
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
|
| 258 |
+
|
| 259 |
+
# Tells whether missing members accessed in mixin class should be ignored. A
|
| 260 |
+
# mixin class is detected if its name ends with "mixin" (case insensitive).
|
| 261 |
+
ignore-mixin-members=yes
|
| 262 |
+
|
| 263 |
+
# List of module names for which member attributes should not be checked
|
| 264 |
+
# (useful for modules/projects where namespaces are manipulated during runtime
|
| 265 |
+
# and thus existing member attributes cannot be deduced by static analysis. It
|
| 266 |
+
# supports qualified module names, as well as Unix pattern matching.
|
| 267 |
+
ignored-modules=
|
| 268 |
+
|
| 269 |
+
# List of class names for which member attributes should not be checked (useful
|
| 270 |
+
# for classes with dynamically set attributes). This supports the use of
|
| 271 |
+
# qualified names.
|
| 272 |
+
ignored-classes=optparse.Values,thread._local,_thread._local
|
| 273 |
+
|
| 274 |
+
# List of members which are set dynamically and missed by pylint inference
|
| 275 |
+
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
| 276 |
+
# expressions are accepted.
|
| 277 |
+
generated-members=
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
[FORMAT]
|
| 281 |
+
|
| 282 |
+
# Maximum number of characters on a single line.
|
| 283 |
+
max-line-length=80
|
| 284 |
+
|
| 285 |
+
# lines made too long by directives to pytype.
|
| 286 |
+
|
| 287 |
+
# Regexp for a line that is allowed to be longer than the limit.
|
| 288 |
+
ignore-long-lines=(?x)
|
| 289 |
+
(^\s*(import|from)\s
|
| 290 |
+
|\$Id:\s\/\/depot\/.+#\d+\s\$
|
| 291 |
+
|^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+')
|
| 292 |
+
|^\s*\#\ LINT\.ThenChange
|
| 293 |
+
|^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$
|
| 294 |
+
|pylint
|
| 295 |
+
|"""
|
| 296 |
+
|\#
|
| 297 |
+
|lambda
|
| 298 |
+
|(https?|ftp):)
|
| 299 |
+
|
| 300 |
+
# Allow the body of an if to be on the same line as the test if there is no
|
| 301 |
+
# else.
|
| 302 |
+
single-line-if-stmt=yes
|
| 303 |
+
|
| 304 |
+
# Maximum number of lines in a module
|
| 305 |
+
max-module-lines=99999
|
| 306 |
+
|
| 307 |
+
# String used as indentation unit. The internal Google style guide mandates 2
|
| 308 |
+
# spaces. Google's externally-published style guide says 4, consistent with
|
| 309 |
+
# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
|
| 310 |
+
# projects (like TensorFlow).
|
| 311 |
+
indent-string=' '
|
| 312 |
+
|
| 313 |
+
# Number of spaces of indent required inside a hanging or continued line.
|
| 314 |
+
indent-after-paren=4
|
| 315 |
+
|
| 316 |
+
# Expected format of line ending, e.g., empty (any line ending), LF or CRLF.
|
| 317 |
+
expected-line-ending-format=
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
[MISCELLANEOUS]
|
| 321 |
+
|
| 322 |
+
# List of note tags to take in consideration, separated by a comma.
|
| 323 |
+
notes=TODO
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
[STRING]
|
| 327 |
+
|
| 328 |
+
# This flag controls whether inconsistent-quotes generates a warning when the
|
| 329 |
+
# character used as a quote delimiter is used inconsistently within a module.
|
| 330 |
+
check-quote-consistency=yes
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
[VARIABLES]
|
| 334 |
+
|
| 335 |
+
# Tells whether we should check for unused import in __init__ files.
|
| 336 |
+
init-import=no
|
| 337 |
+
|
| 338 |
+
# A regular expression matching the name of dummy variables (i.e., expectedly
|
| 339 |
+
# not used).
|
| 340 |
+
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
|
| 341 |
+
|
| 342 |
+
# List of additional names supposed to be defined in builtins. Remember that
|
| 343 |
+
# you should avoid defining new builtins when possible.
|
| 344 |
+
additional-builtins=
|
| 345 |
+
|
| 346 |
+
# List of strings which can identify a callback function by name. A callback
|
| 347 |
+
# name must start or end with one of those strings.
|
| 348 |
+
callbacks=cb_,_cb
|
| 349 |
+
|
| 350 |
+
# List of qualified module names which can have objects that can redefine
|
| 351 |
+
# builtins.
|
| 352 |
+
redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
[LOGGING]
|
| 356 |
+
|
| 357 |
+
# Logging modules to check that the string format arguments are in logging
|
| 358 |
+
# function parameter format
|
| 359 |
+
logging-modules=logging,absl.logging,tensorflow.io.logging
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
[SIMILARITIES]
|
| 363 |
+
|
| 364 |
+
# Minimum lines number of a similarity.
|
| 365 |
+
min-similarity-lines=4
|
| 366 |
+
|
| 367 |
+
# Ignore comments when computing similarities.
|
| 368 |
+
ignore-comments=yes
|
| 369 |
+
|
| 370 |
+
# Ignore docstrings when computing similarities.
|
| 371 |
+
ignore-docstrings=yes
|
| 372 |
+
|
| 373 |
+
# Ignore imports when computing similarities.
|
| 374 |
+
ignore-imports=no
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
[SPELLING]
|
| 378 |
+
|
| 379 |
+
# Spelling dictionary name. Available dictionaries: none. To make it work,
|
| 380 |
+
# install python-enchant package.
|
| 381 |
+
spelling-dict=
|
| 382 |
+
|
| 383 |
+
# List of comma separated words that should not be checked.
|
| 384 |
+
spelling-ignore-words=
|
| 385 |
+
|
| 386 |
+
# A path to a file that contains private dictionary; one word per line.
|
| 387 |
+
spelling-private-dict-file=
|
| 388 |
+
|
| 389 |
+
# Tells whether to store unknown words to indicated private dictionary in
|
| 390 |
+
# --spelling-private-dict-file option instead of raising a message.
|
| 391 |
+
spelling-store-unknown-words=no
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
[IMPORTS]
|
| 395 |
+
|
| 396 |
+
# Deprecated modules which should not be used, separated by a comma
|
| 397 |
+
deprecated-modules=regsub,
|
| 398 |
+
TERMIOS,
|
| 399 |
+
Bastion,
|
| 400 |
+
rexec,
|
| 401 |
+
sets
|
| 402 |
+
|
| 403 |
+
# Create a graph of every (i.e., internal and external) dependencies in the
|
| 404 |
+
# given file (report RP0402 must not be disabled)
|
| 405 |
+
import-graph=
|
| 406 |
+
|
| 407 |
+
# Create a graph of external dependencies in the given file (report RP0402 must
|
| 408 |
+
# not be disabled)
|
| 409 |
+
ext-import-graph=
|
| 410 |
+
|
| 411 |
+
# Create a graph of internal dependencies in the given file (report RP0402 must
|
| 412 |
+
# not be disabled)
|
| 413 |
+
int-import-graph=
|
| 414 |
+
|
| 415 |
+
# Force import order to recognize a module as part of the standard
|
| 416 |
+
# compatibility libraries.
|
| 417 |
+
known-standard-library=
|
| 418 |
+
|
| 419 |
+
# Force import order to recognize a module as part of a third party library.
|
| 420 |
+
known-third-party=enchant, absl
|
| 421 |
+
|
| 422 |
+
# Analyse import fallback blocks. This can be used to support both Python 2 and
|
| 423 |
+
# 3 compatible code, which means that the block might have code that exists
|
| 424 |
+
# only in one or another interpreter, leading to false positives when analysed.
|
| 425 |
+
analyse-fallback-blocks=no
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
[CLASSES]
|
| 429 |
+
|
| 430 |
+
# List of method names used to declare (i.e., assign) instance attributes.
|
| 431 |
+
defining-attr-methods=__init__,
|
| 432 |
+
__new__,
|
| 433 |
+
setUp
|
| 434 |
+
|
| 435 |
+
# List of member names, which should be excluded from the protected access
|
| 436 |
+
# warning.
|
| 437 |
+
exclude-protected=_asdict,
|
| 438 |
+
_fields,
|
| 439 |
+
_replace,
|
| 440 |
+
_source,
|
| 441 |
+
_make
|
| 442 |
+
|
| 443 |
+
# List of valid names for the first argument in a class method.
|
| 444 |
+
valid-classmethod-first-arg=cls,
|
| 445 |
+
class_
|
| 446 |
+
|
| 447 |
+
# List of valid names for the first argument in a metaclass class method.
|
| 448 |
+
valid-metaclass-classmethod-first-arg=mcs
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
[EXCEPTIONS]
|
| 452 |
+
|
| 453 |
+
# Exceptions that will emit a warning when being caught. Defaults to
|
| 454 |
+
# "Exception"
|
| 455 |
+
overgeneral-exceptions=builtins.StandardError,
|
| 456 |
+
builtins.Exception,
|
| 457 |
+
builtins.BaseException
|
| 458 |
+
|
alphagenome/source/.readthedocs.yaml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Read the Docs configuration file for Sphinx projects
|
| 16 |
+
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
| 17 |
+
|
| 18 |
+
# Required
|
| 19 |
+
version: 2
|
| 20 |
+
|
| 21 |
+
# Set the OS, Python version and other tools you might need
|
| 22 |
+
build:
|
| 23 |
+
os: ubuntu-22.04
|
| 24 |
+
tools:
|
| 25 |
+
python: "3.10"
|
| 26 |
+
jobs:
|
| 27 |
+
pre_build:
|
| 28 |
+
# Copy colabs into docs/source so they can be included in the documentation.
|
| 29 |
+
- cp -r colabs docs/source/
|
| 30 |
+
|
| 31 |
+
# Build documentation in the "docs/" directory with Sphinx
|
| 32 |
+
sphinx:
|
| 33 |
+
builder: html
|
| 34 |
+
configuration: docs/source/conf.py
|
| 35 |
+
fail_on_warning: false
|
| 36 |
+
|
| 37 |
+
python:
|
| 38 |
+
install:
|
| 39 |
+
- method: pip
|
| 40 |
+
path: .
|
| 41 |
+
extra_requirements:
|
| 42 |
+
- docs
|
alphagenome/source/CHANGELOG.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Changelog
|
| 2 |
+
|
| 3 |
+
All notable changes to this project will be documented in this file.
|
| 4 |
+
|
| 5 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
| 6 |
+
and this project adheres to
|
| 7 |
+
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
| 8 |
+
|
| 9 |
+
## [0.2.0]
|
| 10 |
+
|
| 11 |
+
### Added
|
| 12 |
+
|
| 13 |
+
- Add `is_insertion` and `is_deletion` properties to `Variant`.
|
| 14 |
+
- Add `DnaModel` abstract base class.
|
| 15 |
+
- Add support for center mask scoring over the entire sequence by passing
|
| 16 |
+
`None` for width.
|
| 17 |
+
|
| 18 |
+
### Changed
|
| 19 |
+
|
| 20 |
+
- Move RPC requests and responses to `dna_model_service.proto`.
|
| 21 |
+
- Move functionality to convert `TrackData` to/from protocol buffers to
|
| 22 |
+
utility module.
|
| 23 |
+
|
| 24 |
+
## [0.1.0]
|
| 25 |
+
|
| 26 |
+
### Added
|
| 27 |
+
|
| 28 |
+
- Add `L2_DIFF_LOG1P` variant scoring aggregation type.
|
| 29 |
+
- Add `is_snv` property to `Variant`.
|
| 30 |
+
- Add non-zero mean track metadata field to model output metadata.
|
| 31 |
+
- Add optional interval argument to `predict_sequence`.
|
| 32 |
+
|
| 33 |
+
## [0.0.2]
|
| 34 |
+
|
| 35 |
+
### Added
|
| 36 |
+
|
| 37 |
+
- `colab_utils` module to wrap reading API keys from environment variables or
|
| 38 |
+
Google Colab secrets.
|
| 39 |
+
|
| 40 |
+
## [0.0.1]
|
| 41 |
+
|
| 42 |
+
Initial release.
|
alphagenome/source/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# How to Contribute
|
| 2 |
+
|
| 3 |
+
## Contributor License Agreement
|
| 4 |
+
|
| 5 |
+
Contributions to this project must be accompanied by a Contributor License
|
| 6 |
+
Agreement. You (or your employer) retain the copyright to your contribution;
|
| 7 |
+
this simply gives us permission to use and redistribute your contributions as
|
| 8 |
+
part of the project. Head over to <https://cla.developers.google.com/> to see
|
| 9 |
+
your current agreements on file or to sign a new one.
|
| 10 |
+
|
| 11 |
+
You generally only need to submit a CLA once, so if you've already submitted one
|
| 12 |
+
(even if it was for a different project), you probably don't need to do it
|
| 13 |
+
again.
|
| 14 |
+
|
| 15 |
+
## Code reviews
|
| 16 |
+
|
| 17 |
+
All submissions, including submissions by project members, require review. We
|
| 18 |
+
use GitHub pull requests for this purpose. Consult
|
| 19 |
+
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
|
| 20 |
+
information on using pull requests.
|
| 21 |
+
|
| 22 |
+
## Community Guidelines
|
| 23 |
+
|
| 24 |
+
This project follows [Google's Open Source Community
|
| 25 |
+
Guidelines](https://opensource.google/conduct/).
|
alphagenome/source/LICENSE
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
Apache License
|
| 3 |
+
Version 2.0, January 2004
|
| 4 |
+
http://www.apache.org/licenses/
|
| 5 |
+
|
| 6 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 7 |
+
|
| 8 |
+
1. Definitions.
|
| 9 |
+
|
| 10 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 11 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 12 |
+
|
| 13 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 14 |
+
the copyright owner that is granting the License.
|
| 15 |
+
|
| 16 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 17 |
+
other entities that control, are controlled by, or are under common
|
| 18 |
+
control with that entity. For the purposes of this definition,
|
| 19 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 20 |
+
direction or management of such entity, whether by contract or
|
| 21 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 22 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 23 |
+
|
| 24 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 25 |
+
exercising permissions granted by this License.
|
| 26 |
+
|
| 27 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 28 |
+
including but not limited to software source code, documentation
|
| 29 |
+
source, and configuration files.
|
| 30 |
+
|
| 31 |
+
"Object" form shall mean any form resulting from mechanical
|
| 32 |
+
transformation or translation of a Source form, including but
|
| 33 |
+
not limited to compiled object code, generated documentation,
|
| 34 |
+
and conversions to other media types.
|
| 35 |
+
|
| 36 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 37 |
+
Object form, made available under the License, as indicated by a
|
| 38 |
+
copyright notice that is included in or attached to the work
|
| 39 |
+
(an example is provided in the Appendix below).
|
| 40 |
+
|
| 41 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 42 |
+
form, that is based on (or derived from) the Work and for which the
|
| 43 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 44 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 45 |
+
of this License, Derivative Works shall not include works that remain
|
| 46 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 47 |
+
the Work and Derivative Works thereof.
|
| 48 |
+
|
| 49 |
+
"Contribution" shall mean any work of authorship, including
|
| 50 |
+
the original version of the Work and any modifications or additions
|
| 51 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 52 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 53 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 54 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 55 |
+
means any form of electronic, verbal, or written communication sent
|
| 56 |
+
to the Licensor or its representatives, including but not limited to
|
| 57 |
+
communication on electronic mailing lists, source code control systems,
|
| 58 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 59 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 60 |
+
excluding communication that is conspicuously marked or otherwise
|
| 61 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 62 |
+
|
| 63 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 64 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 65 |
+
subsequently incorporated within the Work.
|
| 66 |
+
|
| 67 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 68 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 69 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 70 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 71 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 72 |
+
Work and such Derivative Works in Source or Object form.
|
| 73 |
+
|
| 74 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 75 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 76 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 77 |
+
(except as stated in this section) patent license to make, have made,
|
| 78 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 79 |
+
where such license applies only to those patent claims licensable
|
| 80 |
+
by such Contributor that are necessarily infringed by their
|
| 81 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 82 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 83 |
+
institute patent litigation against any entity (including a
|
| 84 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 85 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 86 |
+
or contributory patent infringement, then any patent licenses
|
| 87 |
+
granted to You under this License for that Work shall terminate
|
| 88 |
+
as of the date such litigation is filed.
|
| 89 |
+
|
| 90 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 91 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 92 |
+
modifications, and in Source or Object form, provided that You
|
| 93 |
+
meet the following conditions:
|
| 94 |
+
|
| 95 |
+
(a) You must give any other recipients of the Work or
|
| 96 |
+
Derivative Works a copy of this License; and
|
| 97 |
+
|
| 98 |
+
(b) You must cause any modified files to carry prominent notices
|
| 99 |
+
stating that You changed the files; and
|
| 100 |
+
|
| 101 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 102 |
+
that You distribute, all copyright, patent, trademark, and
|
| 103 |
+
attribution notices from the Source form of the Work,
|
| 104 |
+
excluding those notices that do not pertain to any part of
|
| 105 |
+
the Derivative Works; and
|
| 106 |
+
|
| 107 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 108 |
+
distribution, then any Derivative Works that You distribute must
|
| 109 |
+
include a readable copy of the attribution notices contained
|
| 110 |
+
within such NOTICE file, excluding those notices that do not
|
| 111 |
+
pertain to any part of the Derivative Works, in at least one
|
| 112 |
+
of the following places: within a NOTICE text file distributed
|
| 113 |
+
as part of the Derivative Works; within the Source form or
|
| 114 |
+
documentation, if provided along with the Derivative Works; or,
|
| 115 |
+
within a display generated by the Derivative Works, if and
|
| 116 |
+
wherever such third-party notices normally appear. The contents
|
| 117 |
+
of the NOTICE file are for informational purposes only and
|
| 118 |
+
do not modify the License. You may add Your own attribution
|
| 119 |
+
notices within Derivative Works that You distribute, alongside
|
| 120 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 121 |
+
that such additional attribution notices cannot be construed
|
| 122 |
+
as modifying the License.
|
| 123 |
+
|
| 124 |
+
You may add Your own copyright statement to Your modifications and
|
| 125 |
+
may provide additional or different license terms and conditions
|
| 126 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 127 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 128 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 129 |
+
the conditions stated in this License.
|
| 130 |
+
|
| 131 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 132 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 133 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 134 |
+
this License, without any additional terms or conditions.
|
| 135 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 136 |
+
the terms of any separate license agreement you may have executed
|
| 137 |
+
with Licensor regarding such Contributions.
|
| 138 |
+
|
| 139 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 140 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 141 |
+
except as required for reasonable and customary use in describing the
|
| 142 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 143 |
+
|
| 144 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 145 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 146 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 147 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 148 |
+
implied, including, without limitation, any warranties or conditions
|
| 149 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 150 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 151 |
+
appropriateness of using or redistributing the Work and assume any
|
| 152 |
+
risks associated with Your exercise of permissions under this License.
|
| 153 |
+
|
| 154 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 155 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 156 |
+
unless required by applicable law (such as deliberate and grossly
|
| 157 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 158 |
+
liable to You for damages, including any direct, indirect, special,
|
| 159 |
+
incidental, or consequential damages of any character arising as a
|
| 160 |
+
result of this License or out of the use or inability to use the
|
| 161 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 162 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 163 |
+
other commercial damages or losses), even if such Contributor
|
| 164 |
+
has been advised of the possibility of such damages.
|
| 165 |
+
|
| 166 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 167 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 168 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 169 |
+
or other liability obligations and/or rights consistent with this
|
| 170 |
+
License. However, in accepting such obligations, You may act only
|
| 171 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 172 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 173 |
+
defend, and hold each Contributor harmless for any liability
|
| 174 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 175 |
+
of your accepting any such warranty or additional liability.
|
| 176 |
+
|
| 177 |
+
END OF TERMS AND CONDITIONS
|
| 178 |
+
|
| 179 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 180 |
+
|
| 181 |
+
To apply the Apache License to your work, attach the following
|
| 182 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 183 |
+
replaced with your own identifying information. (Don't include
|
| 184 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 185 |
+
comment syntax for the file format. We also recommend that a
|
| 186 |
+
file or class name and description of purpose be included on the
|
| 187 |
+
same "printed page" as the copyright notice for easier
|
| 188 |
+
identification within third-party archives.
|
| 189 |
+
|
| 190 |
+
Copyright [yyyy] [name of copyright owner]
|
| 191 |
+
|
| 192 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 193 |
+
you may not use this file except in compliance with the License.
|
| 194 |
+
You may obtain a copy of the License at
|
| 195 |
+
|
| 196 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 197 |
+
|
| 198 |
+
Unless required by applicable law or agreed to in writing, software
|
| 199 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 200 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 201 |
+
See the License for the specific language governing permissions and
|
| 202 |
+
limitations under the License.
|
alphagenome/source/README.md
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+

|
| 2 |
+
|
| 3 |
+
# AlphaGenome
|
| 4 |
+
|
| 5 |
+

|
| 6 |
+

|
| 7 |
+
|
| 8 |
+
[**Get API key**](https://deepmind.google.com/science/alphagenome) |
|
| 9 |
+
[**Quick start**](#quick-start) | [**Installation**](#installation) |
|
| 10 |
+
[**Documentation**](https://www.alphagenomedocs.com/) |
|
| 11 |
+
[**Community**](https://www.alphagenomecommunity.com) |
|
| 12 |
+
[**Terms of Use**](https://deepmind.google.com/science/alphagenome/terms)
|
| 13 |
+
|
| 14 |
+
The AlphaGenome API provides access to AlphaGenome, Google DeepMind’s unifying
|
| 15 |
+
model for deciphering the regulatory code within DNA sequences. This repository
|
| 16 |
+
contains client-side code, examples and documentation to help you use the
|
| 17 |
+
AlphaGenome API.
|
| 18 |
+
|
| 19 |
+
AlphaGenome offers multimodal predictions, encompassing diverse functional
|
| 20 |
+
outputs such as gene expression, splicing patterns, chromatin features, and
|
| 21 |
+
contact maps (see [diagram below](#model_overview)). The model analyzes DNA
|
| 22 |
+
sequences of up to 1 million base pairs in length and can deliver predictions at
|
| 23 |
+
single base-pair resolution for most outputs. AlphaGenome achieves
|
| 24 |
+
state-of-the-art performance across a range of genomic prediction benchmarks,
|
| 25 |
+
including numerous diverse variant effect prediction tasks (detailed in
|
| 26 |
+
[Avsec et al. 2025](https://doi.org/10.1101/2025.06.25.661532)).
|
| 27 |
+
|
| 28 |
+
The API is offered free of charge for
|
| 29 |
+
[non-commercial use](https://deepmind.google.com/science/alphagenome/terms)
|
| 30 |
+
(subject to the terms of use). Query rates vary based on demand – it is well
|
| 31 |
+
suited for smaller to medium-scale analyses such as analysing a limited number
|
| 32 |
+
of genomic regions or variants requiring 1000s of predictions, but is likely not
|
| 33 |
+
suitable for large scale analyses requiring more than 1 million predictions.
|
| 34 |
+
Once you obtain your API key, you can easily get started by following our
|
| 35 |
+
[Quick Start Guide](#quick-start), or watching our
|
| 36 |
+
[AlphaGenome 101 tutorial](https://youtu.be/Xbvloe13nak).
|
| 37 |
+
|
| 38 |
+
<a id='model_overview'>
|
| 39 |
+
|
| 40 |
+

|
| 41 |
+
|
| 42 |
+
</a>
|
| 43 |
+
|
| 44 |
+
The documentation also covers a set of comprehensive tutorials, variant scoring
|
| 45 |
+
strategies to efficiently score variant effects, and a visualization library to
|
| 46 |
+
generate `matplotlib` figures for the different output modalities.
|
| 47 |
+
|
| 48 |
+
We cover additional details of the capabilities and limitations in our
|
| 49 |
+
documentation. For support and feedback:
|
| 50 |
+
|
| 51 |
+
- Please submit bugs and any code-related issues on
|
| 52 |
+
[GitHub](https://github.com/google-deepmind/alphagenome/issues).
|
| 53 |
+
- For general feedback, questions about usage, and/or feature requests, please
|
| 54 |
+
use the [community forum](https://www.alphagenomecommunity.com) – it’s
|
| 55 |
+
actively monitored by our team so you're likely to find answers and insights
|
| 56 |
+
faster.
|
| 57 |
+
- If you can't find what you're looking for, please get in touch with the
|
| 58 |
+
AlphaGenome team at alphagenome@google.com and we will be happy to assist
|
| 59 |
+
you with questions. We’re working hard to answer all inquiries but there may
|
| 60 |
+
be a short delay in our response due to the high volume we are receiving.
|
| 61 |
+
|
| 62 |
+
## Quick start
|
| 63 |
+
|
| 64 |
+
The quickest way to get started is to run our example notebooks in
|
| 65 |
+
[Google Colab](https://colab.research.google.com/). Here are some starter
|
| 66 |
+
notebooks:
|
| 67 |
+
|
| 68 |
+
- [Quick start](https://colab.research.google.com/github/google-deepmind/alphagenome/blob/main/colabs/quick_start.ipynb):
|
| 69 |
+
An introduction to quickly get you started with using the model and making
|
| 70 |
+
predictions.
|
| 71 |
+
- [Visualizing predictions](https://colab.research.google.com/github/google-deepmind/alphagenome/blob/main/colabs/visualization_modality_tour.ipynb):
|
| 72 |
+
Learn how to visualize different model predictions using the visualization
|
| 73 |
+
libraries.
|
| 74 |
+
|
| 75 |
+
Alternatively, you can dive straight in by following the
|
| 76 |
+
[installation guide](#installation) and start writing code! Here's an example of
|
| 77 |
+
making a variant prediction:
|
| 78 |
+
|
| 79 |
+
```python
|
| 80 |
+
from alphagenome.data import genome
|
| 81 |
+
from alphagenome.models import dna_client
|
| 82 |
+
from alphagenome.visualization import plot_components
|
| 83 |
+
import matplotlib.pyplot as plt
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
API_KEY = 'MyAPIKey'
|
| 87 |
+
model = dna_client.create(API_KEY)
|
| 88 |
+
|
| 89 |
+
interval = genome.Interval(chromosome='chr22', start=35677410, end=36725986)
|
| 90 |
+
variant = genome.Variant(
|
| 91 |
+
chromosome='chr22',
|
| 92 |
+
position=36201698,
|
| 93 |
+
reference_bases='A',
|
| 94 |
+
alternate_bases='C',
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
outputs = model.predict_variant(
|
| 98 |
+
interval=interval,
|
| 99 |
+
variant=variant,
|
| 100 |
+
ontology_terms=['UBERON:0001157'],
|
| 101 |
+
requested_outputs=[dna_client.OutputType.RNA_SEQ],
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
plot_components.plot(
|
| 105 |
+
[
|
| 106 |
+
plot_components.OverlaidTracks(
|
| 107 |
+
tdata={
|
| 108 |
+
'REF': outputs.reference.rna_seq,
|
| 109 |
+
'ALT': outputs.alternate.rna_seq,
|
| 110 |
+
},
|
| 111 |
+
colors={'REF': 'dimgrey', 'ALT': 'red'},
|
| 112 |
+
),
|
| 113 |
+
],
|
| 114 |
+
interval=outputs.reference.rna_seq.interval.resize(2**15),
|
| 115 |
+
# Annotate the location of the variant as a vertical line.
|
| 116 |
+
annotations=[plot_components.VariantAnnotation([variant], alpha=0.8)],
|
| 117 |
+
)
|
| 118 |
+
plt.show()
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
## Installation
|
| 122 |
+
|
| 123 |
+
<!-- mdformat off(disable for [!TIP] format) -->
|
| 124 |
+
|
| 125 |
+
> [!TIP]
|
| 126 |
+
> You may optionally wish to create a
|
| 127 |
+
> [Python Virtual Environment](https://docs.python.org/3/tutorial/venv.html) to
|
| 128 |
+
> prevent conflicts with your system's Python environment.
|
| 129 |
+
|
| 130 |
+
<!-- mdformat on -->
|
| 131 |
+
|
| 132 |
+
To install `alphagenome`, clone a local copy of the repository and run `pip
|
| 133 |
+
install`:
|
| 134 |
+
|
| 135 |
+
```bash
|
| 136 |
+
$ git clone https://github.com/google-deepmind/alphagenome.git
|
| 137 |
+
$ pip install ./alphagenome
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
See [the documentation](https://www.alphagenomedocs.com/installation.html) for
|
| 141 |
+
information on alternative installation strategies.
|
| 142 |
+
|
| 143 |
+
## Citing `alphagenome`
|
| 144 |
+
|
| 145 |
+
If you use AlphaGenome in your research, please cite using:
|
| 146 |
+
|
| 147 |
+
<!-- disableFinding(SNIPPET_INVALID_LANGUAGE) -->
|
| 148 |
+
|
| 149 |
+
```bibtex
|
| 150 |
+
@article{alphagenome,
|
| 151 |
+
title={{AlphaGenome}: advancing regulatory variant effect prediction with a unified {DNA} sequence model},
|
| 152 |
+
author={Avsec, {\v Z}iga and Latysheva, Natasha and Cheng, Jun and Novati, Guido and Taylor, Kyle R. and Ward, Tom and Bycroft, Clare and Nicolaisen, Lauren and Arvaniti, Eirini and Pan, Joshua and Thomas, Raina and Dutordoir, Vincent and Perino, Matteo and De, Soham and Karollus, Alexander and Gayoso, Adam and Sargeant, Toby and Mottram, Anne and Wong, Lai Hong and Drot{\'a}r, Pavol and Kosiorek, Adam and Senior, Andrew and Tanburn, Richard and Applebaum, Taylor and Basu, Souradeep and Hassabis, Demis and Kohli, Pushmeet},
|
| 153 |
+
year={2025},
|
| 154 |
+
doi={10.1101/2025.06.25.661532},
|
| 155 |
+
publisher={Cold Spring Harbor Laboratory},
|
| 156 |
+
journal={bioRxiv}
|
| 157 |
+
}
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
<!-- enableFinding(SNIPPET_INVALID_LANGUAGE) -->
|
| 161 |
+
|
| 162 |
+
## Acknowledgements
|
| 163 |
+
|
| 164 |
+
AlphaGenome communicates with and/or references the following separate libraries
|
| 165 |
+
and packages:
|
| 166 |
+
|
| 167 |
+
* [Abseil](https://github.com/abseil/abseil-py)
|
| 168 |
+
* [anndata](https://github.com/scverse/anndata)
|
| 169 |
+
* [gRPC](https://github.com/grpc/grpc)
|
| 170 |
+
* [immutabledict](https://github.com/corenting/immutabledict)
|
| 171 |
+
* [intervaltree](https://github.com/chaimleib/intervaltree)
|
| 172 |
+
* [jaxtyping](https://github.com/patrick-kidger/jaxtyping)
|
| 173 |
+
* [matplotlib](https://matplotlib.org/)
|
| 174 |
+
* [ml_dtypes](https://github.com/jax-ml/ml_dtypes)
|
| 175 |
+
* [NumPy](https://numpy.org/)
|
| 176 |
+
* [pandas](https://pandas.pydata.org/)
|
| 177 |
+
* [protobuf](https://developers.google.com/protocol-buffers/)
|
| 178 |
+
* [pyarrow](https://arrow.apache.org/)
|
| 179 |
+
* [SciPy](https://scipy.org/)
|
| 180 |
+
* [seaborn](https://seaborn.pydata.org/)
|
| 181 |
+
* [tqdm](https://github.com/tqdm/tqdm)
|
| 182 |
+
* [typeguard](https://github.com/agronholm/typeguard)
|
| 183 |
+
* [typing_extensions](https://github.com/python/typing_extensions)
|
| 184 |
+
* [zstandard](https://github.com/indygreg/python-zstandard)
|
| 185 |
+
|
| 186 |
+
We thank all their contributors and maintainers!
|
| 187 |
+
|
| 188 |
+
## License and Disclaimer
|
| 189 |
+
|
| 190 |
+
Copyright 2024 Google LLC
|
| 191 |
+
|
| 192 |
+
All software in this repository is licensed under the Apache License, Version
|
| 193 |
+
2.0 (Apache 2.0); you may not use this except in compliance with the Apache 2.0
|
| 194 |
+
license. You may obtain a copy of the Apache 2.0 license at:
|
| 195 |
+
https://www.apache.org/licenses/LICENSE-2.0.
|
| 196 |
+
|
| 197 |
+
Examples and documentation to help you use the AlphaGenome API are licensed
|
| 198 |
+
under the Creative Commons Attribution 4.0 International License (CC-BY). You
|
| 199 |
+
may obtain a copy of the CC-BY license at:
|
| 200 |
+
https://creativecommons.org/licenses/by/4.0/legalcode.
|
| 201 |
+
|
| 202 |
+
Unless required by applicable law or agreed to in writing, all software and
|
| 203 |
+
materials distributed here under the Apache 2.0 or CC-BY licenses are
|
| 204 |
+
distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
| 205 |
+
either express or implied. See the licenses for the specific language governing
|
| 206 |
+
permissions and limitations under those licenses.
|
| 207 |
+
|
| 208 |
+
This is not an official Google product.
|
| 209 |
+
|
| 210 |
+
### Third-party software
|
| 211 |
+
|
| 212 |
+
Your use of any third-party software, libraries or code referenced in the
|
| 213 |
+
materials in this repository (including the libraries listed in the
|
| 214 |
+
[Acknowledgements](#acknowledgements) section) may be governed by separate terms
|
| 215 |
+
and conditions or license provisions. Your use of the third-party software,
|
| 216 |
+
libraries or code is subject to any such terms and you should check that you can
|
| 217 |
+
comply with any applicable restrictions or terms and conditions before use.
|
| 218 |
+
|
| 219 |
+
### Reference Datasets
|
| 220 |
+
|
| 221 |
+
A modified version of the GENCODE dataset (which can be found here:
|
| 222 |
+
https://www.gencodegenes.org/human/releases.html) is released with the client
|
| 223 |
+
code package for illustrative purposes, and is available with reference to the
|
| 224 |
+
following:
|
| 225 |
+
|
| 226 |
+
- Copyright © 2024 EMBL-EBI
|
| 227 |
+
- The GENCODE dataset is subject to the EMBL-EBI terms of use, available at
|
| 228 |
+
https://www.ebi.ac.uk/about/terms-of-use.
|
| 229 |
+
- Citation: Frankish A, et al. (2018) GENCODE reference annotation for the
|
| 230 |
+
human and mouse genome.
|
| 231 |
+
- Further details about GENCODE can be found at
|
| 232 |
+
https://www.gencodegenes.org/human/releases.html, with additional citation
|
| 233 |
+
information at https://www.gencodegenes.org/pages/publications.html and
|
| 234 |
+
further acknowledgements can be found at
|
| 235 |
+
https://www.gencodegenes.org/pages/gencode.html.
|
alphagenome/source/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
alphagenome Project Package Initialization File
|
| 4 |
+
"""
|
alphagenome/source/colabs/batch_variant_scoring.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
alphagenome/source/colabs/essential_commands.ipynb
ADDED
|
@@ -0,0 +1,1405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"metadata": {
|
| 5 |
+
"id": "LIIksEJ7fbxF"
|
| 6 |
+
},
|
| 7 |
+
"cell_type": "markdown",
|
| 8 |
+
"source": [
|
| 9 |
+
"# Essential commands\n",
|
| 10 |
+
"The following describes essential commands for interacting with the AlphaGenome API. It is broken into two sections: data and methods.\n",
|
| 11 |
+
"\n"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"metadata": {
|
| 16 |
+
"id": "gcms9aHWNnqs"
|
| 17 |
+
},
|
| 18 |
+
"cell_type": "markdown",
|
| 19 |
+
"source": [
|
| 20 |
+
"```{tip}\n",
|
| 21 |
+
"Open this tutorial in Google Colab for interactive viewing.\n",
|
| 22 |
+
"```"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"metadata": {
|
| 27 |
+
"executionInfo": {
|
| 28 |
+
"elapsed": 13,
|
| 29 |
+
"status": "ok",
|
| 30 |
+
"timestamp": 1749822645556,
|
| 31 |
+
"user": {
|
| 32 |
+
"displayName": "",
|
| 33 |
+
"userId": ""
|
| 34 |
+
},
|
| 35 |
+
"user_tz": -60
|
| 36 |
+
},
|
| 37 |
+
"id": "iEs6z4rGe3lk"
|
| 38 |
+
},
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"source": [
|
| 41 |
+
"# @title Install AlphaGenome\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"# @markdown Run this cell to install AlphaGenome.\n",
|
| 44 |
+
"from IPython.display import clear_output\n",
|
| 45 |
+
"! pip install alphagenome\n",
|
| 46 |
+
"clear_output()"
|
| 47 |
+
],
|
| 48 |
+
"outputs": [],
|
| 49 |
+
"execution_count": 1
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"metadata": {
|
| 53 |
+
"id": "rKyGK083Wwh7"
|
| 54 |
+
},
|
| 55 |
+
"cell_type": "markdown",
|
| 56 |
+
"source": [
|
| 57 |
+
"# Imports"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"metadata": {
|
| 62 |
+
"executionInfo": {
|
| 63 |
+
"elapsed": 1070,
|
| 64 |
+
"status": "ok",
|
| 65 |
+
"timestamp": 1749822646891,
|
| 66 |
+
"user": {
|
| 67 |
+
"displayName": "",
|
| 68 |
+
"userId": ""
|
| 69 |
+
},
|
| 70 |
+
"user_tz": -60
|
| 71 |
+
},
|
| 72 |
+
"id": "V7MD3DBEfJwf"
|
| 73 |
+
},
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"source": [
|
| 76 |
+
"from alphagenome.data import genome\n",
|
| 77 |
+
"from alphagenome.models import dna_client\n",
|
| 78 |
+
"import numpy as np\n",
|
| 79 |
+
"import pandas as pd\n",
|
| 80 |
+
"from google.colab import userdata"
|
| 81 |
+
],
|
| 82 |
+
"outputs": [],
|
| 83 |
+
"execution_count": 2
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"metadata": {
|
| 87 |
+
"id": "dzkwq2tyfj0q"
|
| 88 |
+
},
|
| 89 |
+
"cell_type": "markdown",
|
| 90 |
+
"source": [
|
| 91 |
+
"## Data: model inputs"
|
| 92 |
+
]
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"metadata": {
|
| 96 |
+
"id": "3qR6e2XtW5IZ"
|
| 97 |
+
},
|
| 98 |
+
"cell_type": "markdown",
|
| 99 |
+
"source": [
|
| 100 |
+
"### Genomic interval\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"A genomic interval is specified using `genome.Interval`:"
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"metadata": {
|
| 107 |
+
"executionInfo": {
|
| 108 |
+
"elapsed": 57,
|
| 109 |
+
"status": "ok",
|
| 110 |
+
"timestamp": 1749822647220,
|
| 111 |
+
"user": {
|
| 112 |
+
"displayName": "",
|
| 113 |
+
"userId": ""
|
| 114 |
+
},
|
| 115 |
+
"user_tz": -60
|
| 116 |
+
},
|
| 117 |
+
"id": "XIZHnO32W4Hn"
|
| 118 |
+
},
|
| 119 |
+
"cell_type": "code",
|
| 120 |
+
"source": [
|
| 121 |
+
"interval = genome.Interval(chromosome='chr1', start=1_000, end=1_010)"
|
| 122 |
+
],
|
| 123 |
+
"outputs": [],
|
| 124 |
+
"execution_count": 3
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"metadata": {
|
| 128 |
+
"id": "Qn9x1ArcXLVI"
|
| 129 |
+
},
|
| 130 |
+
"cell_type": "markdown",
|
| 131 |
+
"source": [
|
| 132 |
+
"By default, these are human hg38 intervals. See the\n",
|
| 133 |
+
"[FAQ](https://www.alphagenomedocs.com/faqs.html#what-are-the-reference-genome-versions-used-by-the-model) for more\n",
|
| 134 |
+
"details on organisms and genome versions.\n"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"metadata": {
|
| 139 |
+
"id": "PCGNRUfHXOL1"
|
| 140 |
+
},
|
| 141 |
+
"cell_type": "markdown",
|
| 142 |
+
"source": [
|
| 143 |
+
"#### Interval properties\n",
|
| 144 |
+
"\n",
|
| 145 |
+
"Access some handy properties of the interval:\n"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"metadata": {
|
| 150 |
+
"executionInfo": {
|
| 151 |
+
"elapsed": 66,
|
| 152 |
+
"status": "ok",
|
| 153 |
+
"timestamp": 1749822647565,
|
| 154 |
+
"user": {
|
| 155 |
+
"displayName": "",
|
| 156 |
+
"userId": ""
|
| 157 |
+
},
|
| 158 |
+
"user_tz": -60
|
| 159 |
+
},
|
| 160 |
+
"id": "8bn73Lm3XL1C",
|
| 161 |
+
"outputId": "f872b5f0-51bd-4455-9ec6-96c3d19a5c1d"
|
| 162 |
+
},
|
| 163 |
+
"cell_type": "code",
|
| 164 |
+
"source": [
|
| 165 |
+
"interval.center()"
|
| 166 |
+
],
|
| 167 |
+
"outputs": [
|
| 168 |
+
{
|
| 169 |
+
"data": {
|
| 170 |
+
"text/plain": [
|
| 171 |
+
"1005"
|
| 172 |
+
]
|
| 173 |
+
},
|
| 174 |
+
"execution_count": 4,
|
| 175 |
+
"metadata": {},
|
| 176 |
+
"output_type": "execute_result"
|
| 177 |
+
}
|
| 178 |
+
],
|
| 179 |
+
"execution_count": 4
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"metadata": {
|
| 183 |
+
"executionInfo": {
|
| 184 |
+
"elapsed": 56,
|
| 185 |
+
"status": "ok",
|
| 186 |
+
"timestamp": 1749822647918,
|
| 187 |
+
"user": {
|
| 188 |
+
"displayName": "",
|
| 189 |
+
"userId": ""
|
| 190 |
+
},
|
| 191 |
+
"user_tz": -60
|
| 192 |
+
},
|
| 193 |
+
"id": "fJVk-ocQXWhm",
|
| 194 |
+
"outputId": "d0203696-36b6-4ca5-a204-f417284f2ca7"
|
| 195 |
+
},
|
| 196 |
+
"cell_type": "code",
|
| 197 |
+
"source": [
|
| 198 |
+
"interval.width"
|
| 199 |
+
],
|
| 200 |
+
"outputs": [
|
| 201 |
+
{
|
| 202 |
+
"data": {
|
| 203 |
+
"text/plain": [
|
| 204 |
+
"10"
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
"execution_count": 5,
|
| 208 |
+
"metadata": {},
|
| 209 |
+
"output_type": "execute_result"
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"execution_count": 5
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"metadata": {
|
| 216 |
+
"id": "BvlmfYgBXXig"
|
| 217 |
+
},
|
| 218 |
+
"cell_type": "markdown",
|
| 219 |
+
"source": [
|
| 220 |
+
"#### Resize\n",
|
| 221 |
+
"\n",
|
| 222 |
+
"Use `genome.Interval.resize` to resize the interval\n",
|
| 223 |
+
"around its center point:"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"metadata": {
|
| 228 |
+
"executionInfo": {
|
| 229 |
+
"elapsed": 64,
|
| 230 |
+
"status": "ok",
|
| 231 |
+
"timestamp": 1749822648252,
|
| 232 |
+
"user": {
|
| 233 |
+
"displayName": "",
|
| 234 |
+
"userId": ""
|
| 235 |
+
},
|
| 236 |
+
"user_tz": -60
|
| 237 |
+
},
|
| 238 |
+
"id": "y72ZqANrXehY",
|
| 239 |
+
"outputId": "5c001c60-01db-4807-8508-a795a88dc629"
|
| 240 |
+
},
|
| 241 |
+
"cell_type": "code",
|
| 242 |
+
"source": [
|
| 243 |
+
"interval.resize(100)"
|
| 244 |
+
],
|
| 245 |
+
"outputs": [
|
| 246 |
+
{
|
| 247 |
+
"data": {
|
| 248 |
+
"text/plain": [
|
| 249 |
+
"Interval(chromosome='chr1', start=955, end=1055, strand='.', name='')"
|
| 250 |
+
]
|
| 251 |
+
},
|
| 252 |
+
"execution_count": 6,
|
| 253 |
+
"metadata": {},
|
| 254 |
+
"output_type": "execute_result"
|
| 255 |
+
}
|
| 256 |
+
],
|
| 257 |
+
"execution_count": 6
|
| 258 |
+
},
|
| 259 |
+
{
|
| 260 |
+
"metadata": {
|
| 261 |
+
"id": "ZrM4rMJDXkNF"
|
| 262 |
+
},
|
| 263 |
+
"cell_type": "markdown",
|
| 264 |
+
"source": [
|
| 265 |
+
"#### Compare intervals\n",
|
| 266 |
+
"\n",
|
| 267 |
+
"We can also check the interval's relationship to other intervals:"
|
| 268 |
+
]
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"metadata": {
|
| 272 |
+
"executionInfo": {
|
| 273 |
+
"elapsed": 68,
|
| 274 |
+
"status": "ok",
|
| 275 |
+
"timestamp": 1749822648622,
|
| 276 |
+
"user": {
|
| 277 |
+
"displayName": "",
|
| 278 |
+
"userId": ""
|
| 279 |
+
},
|
| 280 |
+
"user_tz": -60
|
| 281 |
+
},
|
| 282 |
+
"id": "Ye04nJETXmBL"
|
| 283 |
+
},
|
| 284 |
+
"cell_type": "code",
|
| 285 |
+
"source": [
|
| 286 |
+
"second_interval = genome.Interval(chromosome='chr1', start=1_005, end=1_015)"
|
| 287 |
+
],
|
| 288 |
+
"outputs": [],
|
| 289 |
+
"execution_count": 7
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"metadata": {
|
| 293 |
+
"executionInfo": {
|
| 294 |
+
"elapsed": 62,
|
| 295 |
+
"status": "ok",
|
| 296 |
+
"timestamp": 1749822648949,
|
| 297 |
+
"user": {
|
| 298 |
+
"displayName": "",
|
| 299 |
+
"userId": ""
|
| 300 |
+
},
|
| 301 |
+
"user_tz": -60
|
| 302 |
+
},
|
| 303 |
+
"id": "9yecEDzAXpIS",
|
| 304 |
+
"outputId": "d5a0d62c-4d96-4945-c33b-9566f5c9d1b0"
|
| 305 |
+
},
|
| 306 |
+
"cell_type": "code",
|
| 307 |
+
"source": [
|
| 308 |
+
"interval.overlaps(second_interval)"
|
| 309 |
+
],
|
| 310 |
+
"outputs": [
|
| 311 |
+
{
|
| 312 |
+
"data": {
|
| 313 |
+
"text/plain": [
|
| 314 |
+
"True"
|
| 315 |
+
]
|
| 316 |
+
},
|
| 317 |
+
"execution_count": 8,
|
| 318 |
+
"metadata": {},
|
| 319 |
+
"output_type": "execute_result"
|
| 320 |
+
}
|
| 321 |
+
],
|
| 322 |
+
"execution_count": 8
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"metadata": {
|
| 326 |
+
"executionInfo": {
|
| 327 |
+
"elapsed": 61,
|
| 328 |
+
"status": "ok",
|
| 329 |
+
"timestamp": 1749822649266,
|
| 330 |
+
"user": {
|
| 331 |
+
"displayName": "",
|
| 332 |
+
"userId": ""
|
| 333 |
+
},
|
| 334 |
+
"user_tz": -60
|
| 335 |
+
},
|
| 336 |
+
"id": "tMN-FGXZXsqr",
|
| 337 |
+
"outputId": "32d4de71-3d1c-4965-91c2-8f82b09d9aab"
|
| 338 |
+
},
|
| 339 |
+
"cell_type": "code",
|
| 340 |
+
"source": [
|
| 341 |
+
"interval.contains(second_interval)"
|
| 342 |
+
],
|
| 343 |
+
"outputs": [
|
| 344 |
+
{
|
| 345 |
+
"data": {
|
| 346 |
+
"text/plain": [
|
| 347 |
+
"False"
|
| 348 |
+
]
|
| 349 |
+
},
|
| 350 |
+
"execution_count": 9,
|
| 351 |
+
"metadata": {},
|
| 352 |
+
"output_type": "execute_result"
|
| 353 |
+
}
|
| 354 |
+
],
|
| 355 |
+
"execution_count": 9
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"metadata": {
|
| 359 |
+
"executionInfo": {
|
| 360 |
+
"elapsed": 330,
|
| 361 |
+
"status": "ok",
|
| 362 |
+
"timestamp": 1749822649862,
|
| 363 |
+
"user": {
|
| 364 |
+
"displayName": "",
|
| 365 |
+
"userId": ""
|
| 366 |
+
},
|
| 367 |
+
"user_tz": -60
|
| 368 |
+
},
|
| 369 |
+
"id": "sDjWXjJYXuPB",
|
| 370 |
+
"outputId": "f43f4e1b-e319-44c7-d6d9-68b5384d579c"
|
| 371 |
+
},
|
| 372 |
+
"cell_type": "code",
|
| 373 |
+
"source": [
|
| 374 |
+
"interval.intersect(second_interval)"
|
| 375 |
+
],
|
| 376 |
+
"outputs": [
|
| 377 |
+
{
|
| 378 |
+
"data": {
|
| 379 |
+
"text/plain": [
|
| 380 |
+
"Interval(chromosome='chr1', start=1005, end=1010, strand='.', name='')"
|
| 381 |
+
]
|
| 382 |
+
},
|
| 383 |
+
"execution_count": 10,
|
| 384 |
+
"metadata": {},
|
| 385 |
+
"output_type": "execute_result"
|
| 386 |
+
}
|
| 387 |
+
],
|
| 388 |
+
"execution_count": 10
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"metadata": {
|
| 392 |
+
"id": "X0U15RKjXwZL"
|
| 393 |
+
},
|
| 394 |
+
"cell_type": "markdown",
|
| 395 |
+
"source": [
|
| 396 |
+
"As a subtle point, AlphaGenome classes use 0-based indexing, meaning that the\n",
|
| 397 |
+
"interval includes the base pair at the `start` position up to the base pair at\n",
|
| 398 |
+
"the `end-1` position. See the [FAQ](https://www.alphagenomedocs.com/faqs.html#how-do-i-specify-a-genomic-region)\n",
|
| 399 |
+
"for more on this topic."
|
| 400 |
+
]
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"metadata": {
|
| 404 |
+
"id": "dIDUCQKOX1Vj"
|
| 405 |
+
},
|
| 406 |
+
"cell_type": "markdown",
|
| 407 |
+
"source": [
|
| 408 |
+
"### Genomic variant\n",
|
| 409 |
+
"\n",
|
| 410 |
+
"A `genome.Variant` specifies a genetic variant:\n"
|
| 411 |
+
]
|
| 412 |
+
},
|
| 413 |
+
{
|
| 414 |
+
"metadata": {
|
| 415 |
+
"executionInfo": {
|
| 416 |
+
"elapsed": 54,
|
| 417 |
+
"status": "ok",
|
| 418 |
+
"timestamp": 1749822650248,
|
| 419 |
+
"user": {
|
| 420 |
+
"displayName": "",
|
| 421 |
+
"userId": ""
|
| 422 |
+
},
|
| 423 |
+
"user_tz": -60
|
| 424 |
+
},
|
| 425 |
+
"id": "R_D6AoKFXyBJ"
|
| 426 |
+
},
|
| 427 |
+
"cell_type": "code",
|
| 428 |
+
"source": [
|
| 429 |
+
"variant = genome.Variant(\n",
|
| 430 |
+
" chromosome='chr3', position=10_000, reference_bases='A', alternate_bases='C'\n",
|
| 431 |
+
")"
|
| 432 |
+
],
|
| 433 |
+
"outputs": [],
|
| 434 |
+
"execution_count": 11
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"metadata": {
|
| 438 |
+
"id": "L9QcFhogX693"
|
| 439 |
+
},
|
| 440 |
+
"cell_type": "markdown",
|
| 441 |
+
"source": [
|
| 442 |
+
"This variant changes the base `A` to a `C` at position 10\\_000 on chromosome 3\\.\n",
|
| 443 |
+
"Note that the `position` attribute is 1-based to maintain compatibility with\n",
|
| 444 |
+
"common public variant formats (see [FAQ](https://www.alphagenomedocs.com/faqs.html#how-do-i-define-a-variant) for more\n",
|
| 445 |
+
"info)."
|
| 446 |
+
]
|
| 447 |
+
},
|
| 448 |
+
{
|
| 449 |
+
"metadata": {
|
| 450 |
+
"id": "l6SRhPTrYKY3"
|
| 451 |
+
},
|
| 452 |
+
"cell_type": "markdown",
|
| 453 |
+
"source": [
|
| 454 |
+
"#### Insertions or deletions (indels)\n",
|
| 455 |
+
"\n",
|
| 456 |
+
"Variants can also be larger than a single base, such as insertions or deletions:"
|
| 457 |
+
]
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"metadata": {
|
| 461 |
+
"executionInfo": {
|
| 462 |
+
"elapsed": 56,
|
| 463 |
+
"status": "ok",
|
| 464 |
+
"timestamp": 1749822650560,
|
| 465 |
+
"user": {
|
| 466 |
+
"displayName": "",
|
| 467 |
+
"userId": ""
|
| 468 |
+
},
|
| 469 |
+
"user_tz": -60
|
| 470 |
+
},
|
| 471 |
+
"id": "PMmSYhSfX9K9"
|
| 472 |
+
},
|
| 473 |
+
"cell_type": "code",
|
| 474 |
+
"source": [
|
| 475 |
+
"# Insertion variant.\n",
|
| 476 |
+
"variant = genome.Variant(\n",
|
| 477 |
+
" chromosome='chr3',\n",
|
| 478 |
+
" position=10_000,\n",
|
| 479 |
+
" reference_bases='T',\n",
|
| 480 |
+
" alternate_bases='CGTCAAT',\n",
|
| 481 |
+
")\n",
|
| 482 |
+
"\n",
|
| 483 |
+
"# Deletion variant.\n",
|
| 484 |
+
"variant = genome.Variant(\n",
|
| 485 |
+
" chromosome='chr3',\n",
|
| 486 |
+
" position=10_000,\n",
|
| 487 |
+
" reference_bases='AGGGATC',\n",
|
| 488 |
+
" alternate_bases='C',\n",
|
| 489 |
+
")"
|
| 490 |
+
],
|
| 491 |
+
"outputs": [],
|
| 492 |
+
"execution_count": 12
|
| 493 |
+
},
|
| 494 |
+
{
|
| 495 |
+
"metadata": {
|
| 496 |
+
"id": "_v9VY9kqYRoP"
|
| 497 |
+
},
|
| 498 |
+
"cell_type": "markdown",
|
| 499 |
+
"source": [
|
| 500 |
+
"The sequence we pass for the `reference_bases` argument could differ from what\n",
|
| 501 |
+
"is actually at that location in the hg38 reference genome. The model will insert\n",
|
| 502 |
+
"whatever is passed as the reference and alternate bases into the sequence and\n",
|
| 503 |
+
"make predictions on them."
|
| 504 |
+
]
|
| 505 |
+
},
|
| 506 |
+
{
|
| 507 |
+
"metadata": {
|
| 508 |
+
"id": "za3OTKasYYlb"
|
| 509 |
+
},
|
| 510 |
+
"cell_type": "markdown",
|
| 511 |
+
"source": [
|
| 512 |
+
"#### Reference interval\n",
|
| 513 |
+
"\n",
|
| 514 |
+
"We can get the `genome.Interval` corresponding to the\n",
|
| 515 |
+
"reference bases of the variant using `genome.Variant.reference_interval`:"
|
| 516 |
+
]
|
| 517 |
+
},
|
| 518 |
+
{
|
| 519 |
+
"metadata": {
|
| 520 |
+
"executionInfo": {
|
| 521 |
+
"elapsed": 62,
|
| 522 |
+
"status": "ok",
|
| 523 |
+
"timestamp": 1749822650873,
|
| 524 |
+
"user": {
|
| 525 |
+
"displayName": "",
|
| 526 |
+
"userId": ""
|
| 527 |
+
},
|
| 528 |
+
"user_tz": -60
|
| 529 |
+
},
|
| 530 |
+
"id": "UUyBaqWtYfc0",
|
| 531 |
+
"outputId": "cb3fc331-604d-4d58-9802-f4491dfe9bb2"
|
| 532 |
+
},
|
| 533 |
+
"cell_type": "code",
|
| 534 |
+
"source": [
|
| 535 |
+
"variant = genome.Variant(\n",
|
| 536 |
+
" chromosome='chr3', position=10_000, reference_bases='A', alternate_bases='T'\n",
|
| 537 |
+
")\n",
|
| 538 |
+
"\n",
|
| 539 |
+
"variant.reference_interval"
|
| 540 |
+
],
|
| 541 |
+
"outputs": [
|
| 542 |
+
{
|
| 543 |
+
"data": {
|
| 544 |
+
"text/plain": [
|
| 545 |
+
"Interval(chromosome='chr3', start=9999, end=10000, strand='.', name='')"
|
| 546 |
+
]
|
| 547 |
+
},
|
| 548 |
+
"execution_count": 13,
|
| 549 |
+
"metadata": {},
|
| 550 |
+
"output_type": "execute_result"
|
| 551 |
+
}
|
| 552 |
+
],
|
| 553 |
+
"execution_count": 13
|
| 554 |
+
},
|
| 555 |
+
{
|
| 556 |
+
"metadata": {
|
| 557 |
+
"id": "DATTTBK6YiEZ"
|
| 558 |
+
},
|
| 559 |
+
"cell_type": "markdown",
|
| 560 |
+
"source": [
|
| 561 |
+
"A common use-case is to make predictions in a genome region around a variant,\n",
|
| 562 |
+
"which involves resizing the\n",
|
| 563 |
+
"`genome.Variant.reference_interval` to a sequence\n",
|
| 564 |
+
"length compatible with AlphaGenome:"
|
| 565 |
+
]
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"metadata": {
|
| 569 |
+
"executionInfo": {
|
| 570 |
+
"elapsed": 81,
|
| 571 |
+
"status": "ok",
|
| 572 |
+
"timestamp": 1749822651223,
|
| 573 |
+
"user": {
|
| 574 |
+
"displayName": "",
|
| 575 |
+
"userId": ""
|
| 576 |
+
},
|
| 577 |
+
"user_tz": -60
|
| 578 |
+
},
|
| 579 |
+
"id": "YijroJwOYnfU",
|
| 580 |
+
"outputId": "f923811e-b6c8-4aca-bc5a-d821377557d3"
|
| 581 |
+
},
|
| 582 |
+
"cell_type": "code",
|
| 583 |
+
"source": [
|
| 584 |
+
"input_interval = variant.reference_interval.resize(\n",
|
| 585 |
+
" dna_client.SEQUENCE_LENGTH_1MB\n",
|
| 586 |
+
")\n",
|
| 587 |
+
"input_interval.width"
|
| 588 |
+
],
|
| 589 |
+
"outputs": [
|
| 590 |
+
{
|
| 591 |
+
"data": {
|
| 592 |
+
"text/plain": [
|
| 593 |
+
"1048576"
|
| 594 |
+
]
|
| 595 |
+
},
|
| 596 |
+
"execution_count": 14,
|
| 597 |
+
"metadata": {},
|
| 598 |
+
"output_type": "execute_result"
|
| 599 |
+
}
|
| 600 |
+
],
|
| 601 |
+
"execution_count": 14
|
| 602 |
+
},
|
| 603 |
+
{
|
| 604 |
+
"metadata": {
|
| 605 |
+
"id": "qGIpFPd_YmYc"
|
| 606 |
+
},
|
| 607 |
+
"cell_type": "markdown",
|
| 608 |
+
"source": [
|
| 609 |
+
"#### Overlap with interval\n",
|
| 610 |
+
"\n",
|
| 611 |
+
"We can also check if a variant’s reference or alternate alleles overlap a `genome.Interval`:"
|
| 612 |
+
]
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"metadata": {
|
| 616 |
+
"executionInfo": {
|
| 617 |
+
"elapsed": 128,
|
| 618 |
+
"status": "ok",
|
| 619 |
+
"timestamp": 1749822651618,
|
| 620 |
+
"user": {
|
| 621 |
+
"displayName": "",
|
| 622 |
+
"userId": ""
|
| 623 |
+
},
|
| 624 |
+
"user_tz": -60
|
| 625 |
+
},
|
| 626 |
+
"id": "HmiX9TELYxTD",
|
| 627 |
+
"outputId": "3e267c68-30dd-4aaf-e316-2a87df03d96e"
|
| 628 |
+
},
|
| 629 |
+
"cell_type": "code",
|
| 630 |
+
"source": [
|
| 631 |
+
"variant = genome.Variant(\n",
|
| 632 |
+
" chromosome='chr3',\n",
|
| 633 |
+
" position=10_000,\n",
|
| 634 |
+
" reference_bases='T',\n",
|
| 635 |
+
" alternate_bases='CGTCAAT',\n",
|
| 636 |
+
")\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"interval = genome.Interval(chromosome='chr3', start=10_005, end=10_010)\n",
|
| 639 |
+
"\n",
|
| 640 |
+
"print('Reference overlaps:', variant.reference_overlaps(interval))\n",
|
| 641 |
+
"print('Alternative overlaps:', variant.alternate_overlaps(interval))"
|
| 642 |
+
],
|
| 643 |
+
"outputs": [
|
| 644 |
+
{
|
| 645 |
+
"name": "stdout",
|
| 646 |
+
"output_type": "stream",
|
| 647 |
+
"text": [
|
| 648 |
+
"Reference overlaps: False\n",
|
| 649 |
+
"Alternative overlaps: True\n"
|
| 650 |
+
]
|
| 651 |
+
}
|
| 652 |
+
],
|
| 653 |
+
"execution_count": 15
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"metadata": {
|
| 657 |
+
"id": "Kr0329VfZAZo"
|
| 658 |
+
},
|
| 659 |
+
"cell_type": "markdown",
|
| 660 |
+
"source": [
|
| 661 |
+
"## Data: model outputs\n",
|
| 662 |
+
"\n",
|
| 663 |
+
"### Track data\n",
|
| 664 |
+
"\n",
|
| 665 |
+
"\u003ca href=\"https://services.google.com/fh/files/misc/trackdata.png\"\u003e\u003cimg src=\"https://services.google.com/fh/files/misc/trackdata.png\" alt=\"anndata\" border=\"0\" height=500\u003e\u003c/a\u003e\n",
|
| 666 |
+
"\n",
|
| 667 |
+
"`track_data.TrackData` objects store model predictions.\n",
|
| 668 |
+
"They have the following properties (using `tdata` as an example of a\n",
|
| 669 |
+
"`track_data.TrackData` object):\n",
|
| 670 |
+
"\n",
|
| 671 |
+
"* `tdata.values` stores track predictions as a `numpy.ndarray`.\n",
|
| 672 |
+
"* `tdata.metadata` stores track metadata as a `pandas.DataFrame`. For\n",
|
| 673 |
+
" each track in the predicted values, there will be a corresponding row in the\n",
|
| 674 |
+
" track metadata describing its origin.\n",
|
| 675 |
+
"* `tdata.uns` contains additional unstructured metadata as a `dict`."
|
| 676 |
+
]
|
| 677 |
+
},
|
| 678 |
+
{
|
| 679 |
+
"metadata": {
|
| 680 |
+
"id": "I3HTL-NTZR6n"
|
| 681 |
+
},
|
| 682 |
+
"cell_type": "markdown",
|
| 683 |
+
"source": [
|
| 684 |
+
"#### From scratch\n",
|
| 685 |
+
"\n",
|
| 686 |
+
"You can create your own `track_data.TrackData` object\n",
|
| 687 |
+
"from scratch by specifying the values and metadata manually. The metadata must\n",
|
| 688 |
+
"contain at least the columns name (the names of the tracks) and strand (the\n",
|
| 689 |
+
"strands of DNA that the tracks are on):\n"
|
| 690 |
+
]
|
| 691 |
+
},
|
| 692 |
+
{
|
| 693 |
+
"metadata": {
|
| 694 |
+
"executionInfo": {
|
| 695 |
+
"elapsed": 58,
|
| 696 |
+
"status": "ok",
|
| 697 |
+
"timestamp": 1749822651961,
|
| 698 |
+
"user": {
|
| 699 |
+
"displayName": "",
|
| 700 |
+
"userId": ""
|
| 701 |
+
},
|
| 702 |
+
"user_tz": -60
|
| 703 |
+
},
|
| 704 |
+
"id": "uwsbveEvZBbt"
|
| 705 |
+
},
|
| 706 |
+
"cell_type": "code",
|
| 707 |
+
"source": [
|
| 708 |
+
"from alphagenome.data import track_data\n",
|
| 709 |
+
"\n",
|
| 710 |
+
"# Array has shape (4,3) -\u003e sequence is length 4 and there are 3 tracks.\n",
|
| 711 |
+
"values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(\n",
|
| 712 |
+
" np.float32\n",
|
| 713 |
+
")\n",
|
| 714 |
+
"\n",
|
| 715 |
+
"# We have both the positive and negative strand values for track1, while track2\n",
|
| 716 |
+
"# contains unstranded data.\n",
|
| 717 |
+
"metadata = pd.DataFrame({\n",
|
| 718 |
+
" 'name': ['track1', 'track1', 'track2'],\n",
|
| 719 |
+
" 'strand': ['+', '-', '.'],\n",
|
| 720 |
+
"})\n",
|
| 721 |
+
"\n",
|
| 722 |
+
"tdata = track_data.TrackData(values=values, metadata=metadata)"
|
| 723 |
+
],
|
| 724 |
+
"outputs": [],
|
| 725 |
+
"execution_count": 16
|
| 726 |
+
},
|
| 727 |
+
{
|
| 728 |
+
"metadata": {
|
| 729 |
+
"id": "qEc4ioZVZgR0"
|
| 730 |
+
},
|
| 731 |
+
"cell_type": "markdown",
|
| 732 |
+
"source": [
|
| 733 |
+
"#### Resolution\n",
|
| 734 |
+
"\n",
|
| 735 |
+
"It’s also useful to specify the resolution of the tracks and the genomic\n",
|
| 736 |
+
"interval that they come from, if you have this information available:\n"
|
| 737 |
+
]
|
| 738 |
+
},
|
| 739 |
+
{
|
| 740 |
+
"metadata": {
|
| 741 |
+
"executionInfo": {
|
| 742 |
+
"elapsed": 55,
|
| 743 |
+
"status": "ok",
|
| 744 |
+
"timestamp": 1749822652304,
|
| 745 |
+
"user": {
|
| 746 |
+
"displayName": "",
|
| 747 |
+
"userId": ""
|
| 748 |
+
},
|
| 749 |
+
"user_tz": -60
|
| 750 |
+
},
|
| 751 |
+
"id": "JGnuY6g_ZaBL"
|
| 752 |
+
},
|
| 753 |
+
"cell_type": "code",
|
| 754 |
+
"source": [
|
| 755 |
+
"interval = genome.Interval(chromosome='chr1', start=1_000, end=1_004)\n",
|
| 756 |
+
"\n",
|
| 757 |
+
"tdata = track_data.TrackData(\n",
|
| 758 |
+
" values=values, metadata=metadata, resolution=1, interval=interval\n",
|
| 759 |
+
")"
|
| 760 |
+
],
|
| 761 |
+
"outputs": [],
|
| 762 |
+
"execution_count": 17
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"metadata": {
|
| 766 |
+
"id": "QeN8D_yGZlVB"
|
| 767 |
+
},
|
| 768 |
+
"cell_type": "markdown",
|
| 769 |
+
"source": [
|
| 770 |
+
"Note that the length of the values has to match up with the interval width and\n",
|
| 771 |
+
"resolution. Here is an example specifying that the values actually represent\n",
|
| 772 |
+
"128bp resolution tracks (i.e., each number is a summary over 128 base pairs of\n",
|
| 773 |
+
"DNA):\n"
|
| 774 |
+
]
|
| 775 |
+
},
|
| 776 |
+
{
|
| 777 |
+
"metadata": {
|
| 778 |
+
"executionInfo": {
|
| 779 |
+
"elapsed": 56,
|
| 780 |
+
"status": "ok",
|
| 781 |
+
"timestamp": 1749822652621,
|
| 782 |
+
"user": {
|
| 783 |
+
"displayName": "",
|
| 784 |
+
"userId": ""
|
| 785 |
+
},
|
| 786 |
+
"user_tz": -60
|
| 787 |
+
},
|
| 788 |
+
"id": "I-iPD_ZGZjMI"
|
| 789 |
+
},
|
| 790 |
+
"cell_type": "code",
|
| 791 |
+
"source": [
|
| 792 |
+
"interval = genome.Interval(chromosome='chr1', start=1_000, end=1_512)\n",
|
| 793 |
+
"\n",
|
| 794 |
+
"tdata = track_data.TrackData(\n",
|
| 795 |
+
" values=values, metadata=metadata, resolution=128, interval=interval\n",
|
| 796 |
+
")"
|
| 797 |
+
],
|
| 798 |
+
"outputs": [],
|
| 799 |
+
"execution_count": 18
|
| 800 |
+
},
|
| 801 |
+
{
|
| 802 |
+
"metadata": {
|
| 803 |
+
"id": "HhVG6w41ZqEp"
|
| 804 |
+
},
|
| 805 |
+
"cell_type": "markdown",
|
| 806 |
+
"source": [
|
| 807 |
+
"#### Converting between resolutions\n",
|
| 808 |
+
"\n",
|
| 809 |
+
"We can also interconvert between resolutions. For example, given 1bp resolution\n",
|
| 810 |
+
"predictions, we can downsample the resolution (by summing adjacent values) and\n",
|
| 811 |
+
"return a sequence of length 2:"
|
| 812 |
+
]
|
| 813 |
+
},
|
| 814 |
+
{
|
| 815 |
+
"metadata": {
|
| 816 |
+
"executionInfo": {
|
| 817 |
+
"elapsed": 64,
|
| 818 |
+
"status": "ok",
|
| 819 |
+
"timestamp": 1749822652955,
|
| 820 |
+
"user": {
|
| 821 |
+
"displayName": "",
|
| 822 |
+
"userId": ""
|
| 823 |
+
},
|
| 824 |
+
"user_tz": -60
|
| 825 |
+
},
|
| 826 |
+
"id": "Ce41uPkcZoWd",
|
| 827 |
+
"outputId": "38adb4cb-7a38-4cc3-8f12-af6ea95dfc0b"
|
| 828 |
+
},
|
| 829 |
+
"cell_type": "code",
|
| 830 |
+
"source": [
|
| 831 |
+
"interval = genome.Interval(chromosome='chr1', start=1_000, end=1_004)\n",
|
| 832 |
+
"\n",
|
| 833 |
+
"tdata = track_data.TrackData(\n",
|
| 834 |
+
" values=values, metadata=metadata, resolution=1, interval=interval\n",
|
| 835 |
+
")\n",
|
| 836 |
+
"\n",
|
| 837 |
+
"tdata = tdata.change_resolution(resolution=2)\n",
|
| 838 |
+
"tdata.values"
|
| 839 |
+
],
|
| 840 |
+
"outputs": [
|
| 841 |
+
{
|
| 842 |
+
"data": {
|
| 843 |
+
"text/plain": [
|
| 844 |
+
"array([[ 3., 5., 7.],\n",
|
| 845 |
+
" [15., 17., 19.]], dtype=float32)"
|
| 846 |
+
]
|
| 847 |
+
},
|
| 848 |
+
"execution_count": 19,
|
| 849 |
+
"metadata": {},
|
| 850 |
+
"output_type": "execute_result"
|
| 851 |
+
}
|
| 852 |
+
],
|
| 853 |
+
"execution_count": 19
|
| 854 |
+
},
|
| 855 |
+
{
|
| 856 |
+
"metadata": {
|
| 857 |
+
"id": "23Ehe3duZqC3"
|
| 858 |
+
},
|
| 859 |
+
"cell_type": "markdown",
|
| 860 |
+
"source": [
|
| 861 |
+
"We can also upsample track data to get back to 1bp resolution and a sequence of\n",
|
| 862 |
+
"length 4 by repeating values while preserving the sum:"
|
| 863 |
+
]
|
| 864 |
+
},
|
| 865 |
+
{
|
| 866 |
+
"metadata": {
|
| 867 |
+
"executionInfo": {
|
| 868 |
+
"elapsed": 65,
|
| 869 |
+
"status": "ok",
|
| 870 |
+
"timestamp": 1749822653280,
|
| 871 |
+
"user": {
|
| 872 |
+
"displayName": "",
|
| 873 |
+
"userId": ""
|
| 874 |
+
},
|
| 875 |
+
"user_tz": -60
|
| 876 |
+
},
|
| 877 |
+
"id": "dDi9OWWWZzq_",
|
| 878 |
+
"outputId": "51f3878b-d71e-48f7-f6c7-7a87830704ed"
|
| 879 |
+
},
|
| 880 |
+
"cell_type": "code",
|
| 881 |
+
"source": [
|
| 882 |
+
"tdata = tdata.change_resolution(resolution=1)\n",
|
| 883 |
+
"tdata.values"
|
| 884 |
+
],
|
| 885 |
+
"outputs": [
|
| 886 |
+
{
|
| 887 |
+
"data": {
|
| 888 |
+
"text/plain": [
|
| 889 |
+
"array([[1.5, 2.5, 3.5],\n",
|
| 890 |
+
" [1.5, 2.5, 3.5],\n",
|
| 891 |
+
" [7.5, 8.5, 9.5],\n",
|
| 892 |
+
" [7.5, 8.5, 9.5]], dtype=float32)"
|
| 893 |
+
]
|
| 894 |
+
},
|
| 895 |
+
"execution_count": 20,
|
| 896 |
+
"metadata": {},
|
| 897 |
+
"output_type": "execute_result"
|
| 898 |
+
}
|
| 899 |
+
],
|
| 900 |
+
"execution_count": 20
|
| 901 |
+
},
|
| 902 |
+
{
|
| 903 |
+
"metadata": {
|
| 904 |
+
"id": "Rkne2GEwZ82L"
|
| 905 |
+
},
|
| 906 |
+
"cell_type": "markdown",
|
| 907 |
+
"source": [
|
| 908 |
+
"#### Filtering\n",
|
| 909 |
+
"\n",
|
| 910 |
+
"`track_data.TrackData` objects can be filtered by the\n",
|
| 911 |
+
"type of DNA strand the tracks are on:\n"
|
| 912 |
+
]
|
| 913 |
+
},
|
| 914 |
+
{
|
| 915 |
+
"metadata": {
|
| 916 |
+
"executionInfo": {
|
| 917 |
+
"elapsed": 71,
|
| 918 |
+
"status": "ok",
|
| 919 |
+
"timestamp": 1749822653638,
|
| 920 |
+
"user": {
|
| 921 |
+
"displayName": "",
|
| 922 |
+
"userId": ""
|
| 923 |
+
},
|
| 924 |
+
"user_tz": -60
|
| 925 |
+
},
|
| 926 |
+
"id": "RoIaGXLtaBbz",
|
| 927 |
+
"outputId": "75db5fc1-a5a6-4798-e26f-9d62443abbeb"
|
| 928 |
+
},
|
| 929 |
+
"cell_type": "code",
|
| 930 |
+
"source": [
|
| 931 |
+
"print(\n",
|
| 932 |
+
" 'Positive strand tracks:',\n",
|
| 933 |
+
" tdata.filter_to_positive_strand().metadata.name.values,\n",
|
| 934 |
+
")\n",
|
| 935 |
+
"print(\n",
|
| 936 |
+
" 'Negative strand tracks:',\n",
|
| 937 |
+
" tdata.filter_to_negative_strand().metadata.name.values,\n",
|
| 938 |
+
")\n",
|
| 939 |
+
"print('Unstranded tracks:', tdata.filter_to_unstranded().metadata.name.values)"
|
| 940 |
+
],
|
| 941 |
+
"outputs": [
|
| 942 |
+
{
|
| 943 |
+
"name": "stdout",
|
| 944 |
+
"output_type": "stream",
|
| 945 |
+
"text": [
|
| 946 |
+
"Positive strand tracks: ['track1']\n",
|
| 947 |
+
"Negative strand tracks: ['track1']\n",
|
| 948 |
+
"Unstranded tracks: ['track2']\n"
|
| 949 |
+
]
|
| 950 |
+
}
|
| 951 |
+
],
|
| 952 |
+
"execution_count": 21
|
| 953 |
+
},
|
| 954 |
+
{
|
| 955 |
+
"metadata": {
|
| 956 |
+
"id": "xogO-bWVaatH"
|
| 957 |
+
},
|
| 958 |
+
"cell_type": "markdown",
|
| 959 |
+
"source": [
|
| 960 |
+
"#### Resizing\n",
|
| 961 |
+
"\n",
|
| 962 |
+
"We can resize the `track_data.TrackData` to be either\n",
|
| 963 |
+
"smaller (by cropping):"
|
| 964 |
+
]
|
| 965 |
+
},
|
| 966 |
+
{
|
| 967 |
+
"metadata": {
|
| 968 |
+
"executionInfo": {
|
| 969 |
+
"elapsed": 61,
|
| 970 |
+
"status": "ok",
|
| 971 |
+
"timestamp": 1749822654145,
|
| 972 |
+
"user": {
|
| 973 |
+
"displayName": "",
|
| 974 |
+
"userId": ""
|
| 975 |
+
},
|
| 976 |
+
"user_tz": -60
|
| 977 |
+
},
|
| 978 |
+
"id": "clj536M9abBp",
|
| 979 |
+
"outputId": "f7819f36-70c7-4b78-cb04-b0e2e030ff6e"
|
| 980 |
+
},
|
| 981 |
+
"cell_type": "code",
|
| 982 |
+
"source": [
|
| 983 |
+
"# Re-instantiating the original trackdata.\n",
|
| 984 |
+
"tdata = track_data.TrackData(\n",
|
| 985 |
+
" values=values, metadata=metadata, resolution=1, interval=interval\n",
|
| 986 |
+
")\n",
|
| 987 |
+
"\n",
|
| 988 |
+
"# Resize from width (sequence length) of 4 down to 2.\n",
|
| 989 |
+
"tdata.resize(width=2).values"
|
| 990 |
+
],
|
| 991 |
+
"outputs": [
|
| 992 |
+
{
|
| 993 |
+
"data": {
|
| 994 |
+
"text/plain": [
|
| 995 |
+
"array([[3., 4., 5.],\n",
|
| 996 |
+
" [6., 7., 8.]], dtype=float32)"
|
| 997 |
+
]
|
| 998 |
+
},
|
| 999 |
+
"execution_count": 22,
|
| 1000 |
+
"metadata": {},
|
| 1001 |
+
"output_type": "execute_result"
|
| 1002 |
+
}
|
| 1003 |
+
],
|
| 1004 |
+
"execution_count": 22
|
| 1005 |
+
},
|
| 1006 |
+
{
|
| 1007 |
+
"metadata": {
|
| 1008 |
+
"id": "wl3aNKu-akaO"
|
| 1009 |
+
},
|
| 1010 |
+
"cell_type": "markdown",
|
| 1011 |
+
"source": [
|
| 1012 |
+
"Or bigger (by padding with zeros):\n"
|
| 1013 |
+
]
|
| 1014 |
+
},
|
| 1015 |
+
{
|
| 1016 |
+
"metadata": {
|
| 1017 |
+
"executionInfo": {
|
| 1018 |
+
"elapsed": 61,
|
| 1019 |
+
"status": "ok",
|
| 1020 |
+
"timestamp": 1749822654456,
|
| 1021 |
+
"user": {
|
| 1022 |
+
"displayName": "",
|
| 1023 |
+
"userId": ""
|
| 1024 |
+
},
|
| 1025 |
+
"user_tz": -60
|
| 1026 |
+
},
|
| 1027 |
+
"id": "pHWZRppBaeC1",
|
| 1028 |
+
"outputId": "087a212d-ca6f-4490-fe59-408e9a157ee2"
|
| 1029 |
+
},
|
| 1030 |
+
"cell_type": "code",
|
| 1031 |
+
"source": [
|
| 1032 |
+
"tdata.resize(width=8).values"
|
| 1033 |
+
],
|
| 1034 |
+
"outputs": [
|
| 1035 |
+
{
|
| 1036 |
+
"data": {
|
| 1037 |
+
"text/plain": [
|
| 1038 |
+
"array([[ 0., 0., 0.],\n",
|
| 1039 |
+
" [ 0., 0., 0.],\n",
|
| 1040 |
+
" [ 0., 1., 2.],\n",
|
| 1041 |
+
" [ 3., 4., 5.],\n",
|
| 1042 |
+
" [ 6., 7., 8.],\n",
|
| 1043 |
+
" [ 9., 10., 11.],\n",
|
| 1044 |
+
" [ 0., 0., 0.],\n",
|
| 1045 |
+
" [ 0., 0., 0.]], dtype=float32)"
|
| 1046 |
+
]
|
| 1047 |
+
},
|
| 1048 |
+
"execution_count": 23,
|
| 1049 |
+
"metadata": {},
|
| 1050 |
+
"output_type": "execute_result"
|
| 1051 |
+
}
|
| 1052 |
+
],
|
| 1053 |
+
"execution_count": 23
|
| 1054 |
+
},
|
| 1055 |
+
{
|
| 1056 |
+
"metadata": {
|
| 1057 |
+
"id": "SRkggBayar2r"
|
| 1058 |
+
},
|
| 1059 |
+
"cell_type": "markdown",
|
| 1060 |
+
"source": [
|
| 1061 |
+
"#### Slicing\n",
|
| 1062 |
+
"\n",
|
| 1063 |
+
"We can slice into specific positions of the\n",
|
| 1064 |
+
"`track_data.TrackData`:"
|
| 1065 |
+
]
|
| 1066 |
+
},
|
| 1067 |
+
{
|
| 1068 |
+
"metadata": {
|
| 1069 |
+
"executionInfo": {
|
| 1070 |
+
"elapsed": 56,
|
| 1071 |
+
"status": "ok",
|
| 1072 |
+
"timestamp": 1749822654784,
|
| 1073 |
+
"user": {
|
| 1074 |
+
"displayName": "",
|
| 1075 |
+
"userId": ""
|
| 1076 |
+
},
|
| 1077 |
+
"user_tz": -60
|
| 1078 |
+
},
|
| 1079 |
+
"id": "yHcMdzN1amMz",
|
| 1080 |
+
"outputId": "73a293d2-279c-4709-b79c-cfd24235e19d"
|
| 1081 |
+
},
|
| 1082 |
+
"cell_type": "code",
|
| 1083 |
+
"source": [
|
| 1084 |
+
"# Get the final 2 positions only.\n",
|
| 1085 |
+
"print('slice by position: ', tdata.slice_by_positions(start=2, end=4).values)\n",
|
| 1086 |
+
"# Same, but using slice_by_interval:\n",
|
| 1087 |
+
"print(\n",
|
| 1088 |
+
" 'slice by interval: ',\n",
|
| 1089 |
+
" tdata.slice_by_interval(\n",
|
| 1090 |
+
" genome.Interval(chromosome='chr1', start=1_002, end=1_004)\n",
|
| 1091 |
+
" ).values,\n",
|
| 1092 |
+
")"
|
| 1093 |
+
],
|
| 1094 |
+
"outputs": [
|
| 1095 |
+
{
|
| 1096 |
+
"name": "stdout",
|
| 1097 |
+
"output_type": "stream",
|
| 1098 |
+
"text": [
|
| 1099 |
+
"slice by position: [[ 6. 7. 8.]\n",
|
| 1100 |
+
" [ 9. 10. 11.]]\n",
|
| 1101 |
+
"slice by interval: [[ 6. 7. 8.]\n",
|
| 1102 |
+
" [ 9. 10. 11.]]\n"
|
| 1103 |
+
]
|
| 1104 |
+
}
|
| 1105 |
+
],
|
| 1106 |
+
"execution_count": 24
|
| 1107 |
+
},
|
| 1108 |
+
{
|
| 1109 |
+
"metadata": {
|
| 1110 |
+
"id": "p1OiFpXGa-Ja"
|
| 1111 |
+
},
|
| 1112 |
+
"cell_type": "markdown",
|
| 1113 |
+
"source": [
|
| 1114 |
+
"#### Subsetting tracks\n",
|
| 1115 |
+
"\n",
|
| 1116 |
+
"Subset (and reorder) to specific track names:"
|
| 1117 |
+
]
|
| 1118 |
+
},
|
| 1119 |
+
{
|
| 1120 |
+
"metadata": {
|
| 1121 |
+
"executionInfo": {
|
| 1122 |
+
"elapsed": 63,
|
| 1123 |
+
"status": "ok",
|
| 1124 |
+
"timestamp": 1749822655084,
|
| 1125 |
+
"user": {
|
| 1126 |
+
"displayName": "",
|
| 1127 |
+
"userId": ""
|
| 1128 |
+
},
|
| 1129 |
+
"user_tz": -60
|
| 1130 |
+
},
|
| 1131 |
+
"id": "_l9bKsuvaxtu",
|
| 1132 |
+
"outputId": "abed781e-42d7-4c55-85fe-754ac8351905"
|
| 1133 |
+
},
|
| 1134 |
+
"cell_type": "code",
|
| 1135 |
+
"source": [
|
| 1136 |
+
"# Get only tracks with the name 'track1'.\n",
|
| 1137 |
+
"track1_tdata = tdata.select_tracks_by_name(names='track1')\n",
|
| 1138 |
+
"track1_tdata.values"
|
| 1139 |
+
],
|
| 1140 |
+
"outputs": [
|
| 1141 |
+
{
|
| 1142 |
+
"data": {
|
| 1143 |
+
"text/plain": [
|
| 1144 |
+
"array([[ 0., 1.],\n",
|
| 1145 |
+
" [ 3., 4.],\n",
|
| 1146 |
+
" [ 6., 7.],\n",
|
| 1147 |
+
" [ 9., 10.]], dtype=float32)"
|
| 1148 |
+
]
|
| 1149 |
+
},
|
| 1150 |
+
"execution_count": 25,
|
| 1151 |
+
"metadata": {},
|
| 1152 |
+
"output_type": "execute_result"
|
| 1153 |
+
}
|
| 1154 |
+
],
|
| 1155 |
+
"execution_count": 25
|
| 1156 |
+
},
|
| 1157 |
+
{
|
| 1158 |
+
"metadata": {
|
| 1159 |
+
"id": "bdTrjm4pbLIC"
|
| 1160 |
+
},
|
| 1161 |
+
"cell_type": "markdown",
|
| 1162 |
+
"source": [
|
| 1163 |
+
"The metadata gets automatically filtered to `track1` too:\n"
|
| 1164 |
+
]
|
| 1165 |
+
},
|
| 1166 |
+
{
|
| 1167 |
+
"metadata": {
|
| 1168 |
+
"executionInfo": {
|
| 1169 |
+
"elapsed": 58,
|
| 1170 |
+
"status": "ok",
|
| 1171 |
+
"timestamp": 1749822655417,
|
| 1172 |
+
"user": {
|
| 1173 |
+
"displayName": "",
|
| 1174 |
+
"userId": ""
|
| 1175 |
+
},
|
| 1176 |
+
"user_tz": -60
|
| 1177 |
+
},
|
| 1178 |
+
"id": "aErYz_nRbCID",
|
| 1179 |
+
"outputId": "59679dd0-7f08-44d9-dfb8-3f5a34e7380c"
|
| 1180 |
+
},
|
| 1181 |
+
"cell_type": "code",
|
| 1182 |
+
"source": [
|
| 1183 |
+
"track1_tdata.metadata.name.values"
|
| 1184 |
+
],
|
| 1185 |
+
"outputs": [
|
| 1186 |
+
{
|
| 1187 |
+
"data": {
|
| 1188 |
+
"text/plain": [
|
| 1189 |
+
"array(['track1', 'track1'], dtype=object)"
|
| 1190 |
+
]
|
| 1191 |
+
},
|
| 1192 |
+
"execution_count": 26,
|
| 1193 |
+
"metadata": {},
|
| 1194 |
+
"output_type": "execute_result"
|
| 1195 |
+
}
|
| 1196 |
+
],
|
| 1197 |
+
"execution_count": 26
|
| 1198 |
+
},
|
| 1199 |
+
{
|
| 1200 |
+
"metadata": {
|
| 1201 |
+
"id": "lgprEnpIbPtR"
|
| 1202 |
+
},
|
| 1203 |
+
"cell_type": "markdown",
|
| 1204 |
+
"source": [
|
| 1205 |
+
"Finally, if we pass in a stranded `genome.Interval` or\n",
|
| 1206 |
+
"leave unspecified as `None` when constructing a\n",
|
| 1207 |
+
"`track_data.TrackData`, we can reverse complement\n",
|
| 1208 |
+
"transform our track values in a strand-aware manner:\n"
|
| 1209 |
+
]
|
| 1210 |
+
},
|
| 1211 |
+
{
|
| 1212 |
+
"metadata": {
|
| 1213 |
+
"executionInfo": {
|
| 1214 |
+
"elapsed": 58,
|
| 1215 |
+
"status": "ok",
|
| 1216 |
+
"timestamp": 1749822655727,
|
| 1217 |
+
"user": {
|
| 1218 |
+
"displayName": "",
|
| 1219 |
+
"userId": ""
|
| 1220 |
+
},
|
| 1221 |
+
"user_tz": -60
|
| 1222 |
+
},
|
| 1223 |
+
"id": "HeZXB4K3bMyJ",
|
| 1224 |
+
"outputId": "1d6250e5-f1c3-471a-b347-c73664d056a5"
|
| 1225 |
+
},
|
| 1226 |
+
"cell_type": "code",
|
| 1227 |
+
"source": [
|
| 1228 |
+
"interval = genome.Interval(\n",
|
| 1229 |
+
" chromosome='chr1', start=1_000, end=1_004, strand='+'\n",
|
| 1230 |
+
")\n",
|
| 1231 |
+
"\n",
|
| 1232 |
+
"tdata = track_data.TrackData(\n",
|
| 1233 |
+
" values=values, metadata=metadata, resolution=1, interval=interval\n",
|
| 1234 |
+
")\n",
|
| 1235 |
+
"\n",
|
| 1236 |
+
"tdata.reverse_complement().values"
|
| 1237 |
+
],
|
| 1238 |
+
"outputs": [
|
| 1239 |
+
{
|
| 1240 |
+
"data": {
|
| 1241 |
+
"text/plain": [
|
| 1242 |
+
"array([[10., 9., 11.],\n",
|
| 1243 |
+
" [ 7., 6., 8.],\n",
|
| 1244 |
+
" [ 4., 3., 5.],\n",
|
| 1245 |
+
" [ 1., 0., 2.]], dtype=float32)"
|
| 1246 |
+
]
|
| 1247 |
+
},
|
| 1248 |
+
"execution_count": 27,
|
| 1249 |
+
"metadata": {},
|
| 1250 |
+
"output_type": "execute_result"
|
| 1251 |
+
}
|
| 1252 |
+
],
|
| 1253 |
+
"execution_count": 27
|
| 1254 |
+
},
|
| 1255 |
+
{
|
| 1256 |
+
"metadata": {
|
| 1257 |
+
"id": "Hb8duvxibcXW"
|
| 1258 |
+
},
|
| 1259 |
+
"cell_type": "markdown",
|
| 1260 |
+
"source": [
|
| 1261 |
+
"### Variant scoring output\n",
|
| 1262 |
+
"\n",
|
| 1263 |
+
"# \u003ca href=\"https://services.google.com/fh/files/misc/anndata.png\"\u003e\u003cimg src=\"https://services.google.com/fh/files/misc/anndata.png\" alt=\"anndata\" border=\"0\" height=500\u003e\u003c/a\u003e\n",
|
| 1264 |
+
"\n",
|
| 1265 |
+
"The output of variant scoring is in `anndata.AnnData` format, which is a\n",
|
| 1266 |
+
"way of storing data together with annotation metadata. Originally developed in\n",
|
| 1267 |
+
"the single-cell RNA-seq field, `anndata.AnnData` is useful when you have\n",
|
| 1268 |
+
"metadata associated with an array of data.\n",
|
| 1269 |
+
"\n",
|
| 1270 |
+
"`anndata.AnnData` objects have the following properties (using\n",
|
| 1271 |
+
"`variant_scores` as an example `anndata.AnnData` object):\n",
|
| 1272 |
+
"\n",
|
| 1273 |
+
"* `variant_scores.X` contains a `numpy.ndarray` containing the variant\n",
|
| 1274 |
+
" scores for each gene in the region. This matrix has shape (`num_genes`,\n",
|
| 1275 |
+
" `num_tracks`), where `num_tracks` is the number of output tracks in your\n",
|
| 1276 |
+
" requested OutputType (such as `RNA_SEQ`, `DNASE`, etc.). Note that if you\n",
|
| 1277 |
+
" did not use a gene-centric scorer, then `variant_scores.X` will have shape\n",
|
| 1278 |
+
" (1, `num_tracks`), reflecting the fact that the variant has a single global\n",
|
| 1279 |
+
" score and not per-gene score.\n",
|
| 1280 |
+
"* `variant_scores.var` contains the track metadata as a\n",
|
| 1281 |
+
" `pandas.DataFrame`. For every track in the scores (`num_genes`,\n",
|
| 1282 |
+
" `num_tracks`), there will be a row in the track metadata explaining the\n",
|
| 1283 |
+
" track (its cell type, strand, etc.).\n",
|
| 1284 |
+
"* `variant_scores.obs` contains the gene metadata as a\n",
|
| 1285 |
+
" `pandas.DataFrame`. Note that the gene metadata is None in the case\n",
|
| 1286 |
+
" of non gene-centric variant scorers.\n",
|
| 1287 |
+
"* `variant_scores.uns` contains some additional unstructured metadata that\n",
|
| 1288 |
+
" logs the origin of the variant scores, namely:\n",
|
| 1289 |
+
" * The `genome.Variant` that was scored\n",
|
| 1290 |
+
" (`variant_scores.uns['variant']`)\n",
|
| 1291 |
+
" * The `genome.Interval` containing the interval\n",
|
| 1292 |
+
" (`variant_scores.uns['interval']`)\n",
|
| 1293 |
+
" * The [`variant scorer`](https://www.alphagenomedocs.com/api/models.html#variant-scorers) that was used to\n",
|
| 1294 |
+
" generate the scores (`variant_scores.uns['scorer']`)"
|
| 1295 |
+
]
|
| 1296 |
+
},
|
| 1297 |
+
{
|
| 1298 |
+
"metadata": {
|
| 1299 |
+
"id": "wxew4DE0bmLO"
|
| 1300 |
+
},
|
| 1301 |
+
"cell_type": "markdown",
|
| 1302 |
+
"source": [
|
| 1303 |
+
"#### From scratch\n",
|
| 1304 |
+
"\n",
|
| 1305 |
+
"You are unlikely to need to create an `anndata.AnnData` object from\n",
|
| 1306 |
+
"scratch, but just for reference, here is how it would be done:"
|
| 1307 |
+
]
|
| 1308 |
+
},
|
| 1309 |
+
{
|
| 1310 |
+
"metadata": {
|
| 1311 |
+
"id": "e3jeE3HVbpof"
|
| 1312 |
+
},
|
| 1313 |
+
"cell_type": "code",
|
| 1314 |
+
"source": [
|
| 1315 |
+
"import anndata\n",
|
| 1316 |
+
"import numpy as np\n",
|
| 1317 |
+
"import pandas as pd\n",
|
| 1318 |
+
"\n",
|
| 1319 |
+
"# Creating a small matrix of variant scores (3 genes x 2 tracks).\n",
|
| 1320 |
+
"scores = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])\n",
|
| 1321 |
+
"\n",
|
| 1322 |
+
"gene_metadata = pd.DataFrame({'gene_id': ['ENSG0001', 'ENSG0002', 'ENSG0003']})\n",
|
| 1323 |
+
"\n",
|
| 1324 |
+
"track_metadata = pd.DataFrame(\n",
|
| 1325 |
+
" {'name': ['track1', 'track2'], 'strand': ['+', '-']}\n",
|
| 1326 |
+
")\n",
|
| 1327 |
+
"\n",
|
| 1328 |
+
"variant_scores = anndata.AnnData(\n",
|
| 1329 |
+
" X=scores, obs=gene_metadata, var=track_metadata\n",
|
| 1330 |
+
")"
|
| 1331 |
+
],
|
| 1332 |
+
"outputs": [],
|
| 1333 |
+
"execution_count": null
|
| 1334 |
+
},
|
| 1335 |
+
{
|
| 1336 |
+
"metadata": {
|
| 1337 |
+
"id": "7IHNdc_BbttW"
|
| 1338 |
+
},
|
| 1339 |
+
"cell_type": "markdown",
|
| 1340 |
+
"source": [
|
| 1341 |
+
"## Methods: making predictions\n",
|
| 1342 |
+
"\n",
|
| 1343 |
+
"The main commands for making model predictions are:\n",
|
| 1344 |
+
"\n",
|
| 1345 |
+
"* `dna_client.DnaClient.predict_sequence` to predict\n",
|
| 1346 |
+
" from a raw DNA string\n",
|
| 1347 |
+
"* `dna_client.DnaClient.predict_interval` to predict\n",
|
| 1348 |
+
" from a genome interval (a `genome.Interval`)\n",
|
| 1349 |
+
"* `dna_client.DnaClient.predict_variant` to make\n",
|
| 1350 |
+
" predictions for ref and alt sequences of a variant (a\n",
|
| 1351 |
+
" `genome.Variant` object)\n",
|
| 1352 |
+
"* `dna_client.DnaClient.score_variant` to score the\n",
|
| 1353 |
+
" effects of a variant by comparing ref and alt predictions.\n",
|
| 1354 |
+
"* `dna_client.DnaClient.score_variants` to do the same as\n",
|
| 1355 |
+
" the above, but for scoring a list of multiple variants.\n"
|
| 1356 |
+
]
|
| 1357 |
+
},
|
| 1358 |
+
{
|
| 1359 |
+
"metadata": {
|
| 1360 |
+
"id": "lJ17YLKRQ6nq"
|
| 1361 |
+
},
|
| 1362 |
+
"cell_type": "code",
|
| 1363 |
+
"source": [],
|
| 1364 |
+
"outputs": [],
|
| 1365 |
+
"execution_count": null
|
| 1366 |
+
},
|
| 1367 |
+
{
|
| 1368 |
+
"metadata": {
|
| 1369 |
+
"id": "X10M3ojgbw4a"
|
| 1370 |
+
},
|
| 1371 |
+
"cell_type": "markdown",
|
| 1372 |
+
"source": [
|
| 1373 |
+
"## Methods: visualization\n",
|
| 1374 |
+
"\n",
|
| 1375 |
+
"The main command for visualizing model predictions is:\n",
|
| 1376 |
+
"\n",
|
| 1377 |
+
"* `alphagenome.visualization.plot_components.plot`, to turn a list of\n",
|
| 1378 |
+
" [plot components](https://www.alphagenomedocs.com/api/visualization.html#plot-components) into a\n",
|
| 1379 |
+
" `matplotlib.figure.Figure`.\n",
|
| 1380 |
+
"\n",
|
| 1381 |
+
"See the [visualization basics guide](https://www.alphagenomedocs.com/visualization_library_basics.html) and [visualizing predictions tutorial](https://www.alphagenomedocs.com/colabs/visualization_modality_tour.html) for more details."
|
| 1382 |
+
]
|
| 1383 |
+
}
|
| 1384 |
+
],
|
| 1385 |
+
"metadata": {
|
| 1386 |
+
"colab": {
|
| 1387 |
+
"last_runtime": {},
|
| 1388 |
+
"provenance": [
|
| 1389 |
+
{
|
| 1390 |
+
"file_id": "1hJ2uMZ3sA8pu_UvSNikENLECC-X5XrR8",
|
| 1391 |
+
"timestamp": 1749822158925
|
| 1392 |
+
}
|
| 1393 |
+
]
|
| 1394 |
+
},
|
| 1395 |
+
"kernelspec": {
|
| 1396 |
+
"display_name": "Python 3",
|
| 1397 |
+
"name": "python3"
|
| 1398 |
+
},
|
| 1399 |
+
"language_info": {
|
| 1400 |
+
"name": "python"
|
| 1401 |
+
}
|
| 1402 |
+
},
|
| 1403 |
+
"nbformat": 4,
|
| 1404 |
+
"nbformat_minor": 0
|
| 1405 |
+
}
|
alphagenome/source/colabs/example_analysis_workflow.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
alphagenome/source/colabs/quick_start.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
alphagenome/source/colabs/tissue_ontology_mapping.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
alphagenome/source/colabs/visualization_modality_tour.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
alphagenome/source/conftest.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# https://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Configure FLAGS with default values for absltest."""
|
| 16 |
+
|
| 17 |
+
from absl import app
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
app.run(lambda argv: None)
|
| 21 |
+
except SystemExit:
|
| 22 |
+
pass
|
alphagenome/source/docs/Makefile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal makefile for Sphinx documentation
|
| 2 |
+
#
|
| 3 |
+
|
| 4 |
+
# You can set these variables from the command line, and also
|
| 5 |
+
# from the environment for the first two.
|
| 6 |
+
SPHINXOPTS ?=
|
| 7 |
+
SPHINXBUILD ?= sphinx-build
|
| 8 |
+
SOURCEDIR = source
|
| 9 |
+
BUILDDIR = build
|
| 10 |
+
|
| 11 |
+
# Put it first so that "make" without argument is like "make help".
|
| 12 |
+
help:
|
| 13 |
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
| 14 |
+
|
| 15 |
+
.PHONY: help Makefile
|
| 16 |
+
|
| 17 |
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
| 18 |
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
| 19 |
+
%: Makefile
|
| 20 |
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
alphagenome/source/docs/README.md
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
AlphaGenome API is a service that provides comprehensive and accurate AI
|
| 2 |
+
predictions for genome interpretation.
|
| 3 |
+
|
| 4 |
+
AlphaGenome is a deep learning genomics model that takes a genomic (DNA)
|
| 5 |
+
sequence as input and predicts various molecular properties of DNA & RNA, many
|
| 6 |
+
at single base pair resolution.
|
alphagenome/source/docs/make.bat
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@rem Copyright 2024 Google LLC.
|
| 2 |
+
@rem
|
| 3 |
+
@rem Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
@rem you may not use this file except in compliance with the License.
|
| 5 |
+
@rem You may obtain a copy of the License at
|
| 6 |
+
@rem
|
| 7 |
+
@rem http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
@rem
|
| 9 |
+
@rem Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
@rem distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
@rem See the License for the specific language governing permissions and
|
| 13 |
+
@rem limitations under the License.
|
| 14 |
+
|
| 15 |
+
@ECHO OFF
|
| 16 |
+
|
| 17 |
+
pushd %~dp0
|
| 18 |
+
|
| 19 |
+
REM Command file for Sphinx documentation
|
| 20 |
+
|
| 21 |
+
if "%SPHINXBUILD%" == "" (
|
| 22 |
+
set SPHINXBUILD=sphinx-build
|
| 23 |
+
)
|
| 24 |
+
set SOURCEDIR=source
|
| 25 |
+
set BUILDDIR=build
|
| 26 |
+
|
| 27 |
+
%SPHINXBUILD% >NUL 2>NUL
|
| 28 |
+
if errorlevel 9009 (
|
| 29 |
+
echo.
|
| 30 |
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
| 31 |
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
| 32 |
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
| 33 |
+
echo.may add the Sphinx directory to PATH.
|
| 34 |
+
echo.
|
| 35 |
+
echo.If you don't have Sphinx installed, grab it from
|
| 36 |
+
echo.https://www.sphinx-doc.org/
|
| 37 |
+
exit /b 1
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
if "%1" == "" goto help
|
| 41 |
+
|
| 42 |
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
| 43 |
+
goto end
|
| 44 |
+
|
| 45 |
+
:help
|
| 46 |
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
| 47 |
+
|
| 48 |
+
:end
|
| 49 |
+
popd
|
alphagenome/source/docs/source/_templates/autosummary/class.rst
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{{ fullname | escape | underline }}
|
| 2 |
+
|
| 3 |
+
.. currentmodule:: {{ module }}
|
| 4 |
+
|
| 5 |
+
.. autoclass:: {{ objname }}
|
| 6 |
+
|
| 7 |
+
{% block attributes %}
|
| 8 |
+
{% if attributes %}
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
Attributes
|
| 12 |
+
~~~~~~~~~~
|
| 13 |
+
|
| 14 |
+
.. rubric:: Table
|
| 15 |
+
|
| 16 |
+
.. autosummary::
|
| 17 |
+
{% for item in attributes %}
|
| 18 |
+
{%- if item not in inherited_members%}
|
| 19 |
+
~{{ name }}.{{ item }}
|
| 20 |
+
{%- endif -%}
|
| 21 |
+
{%- endfor %}
|
| 22 |
+
|
| 23 |
+
{% for item in attributes %}
|
| 24 |
+
.. autoattribute:: {{ [objname, item] | join(".") }}
|
| 25 |
+
{%- endfor %}
|
| 26 |
+
|
| 27 |
+
{% endif %}
|
| 28 |
+
{% endblock %}
|
| 29 |
+
|
| 30 |
+
{% block methods %}
|
| 31 |
+
|
| 32 |
+
{% if methods %}
|
| 33 |
+
|
| 34 |
+
Methods
|
| 35 |
+
~~~~~~~
|
| 36 |
+
|
| 37 |
+
.. rubric:: Table
|
| 38 |
+
|
| 39 |
+
.. autosummary::
|
| 40 |
+
{% for item in methods %}
|
| 41 |
+
{%- if item != '__init__' %}
|
| 42 |
+
~{{ name }}.{{ item }}
|
| 43 |
+
{%- endif -%}
|
| 44 |
+
|
| 45 |
+
{%- endfor %}
|
| 46 |
+
|
| 47 |
+
{% for item in methods %}
|
| 48 |
+
{%- if item != '__init__'%}
|
| 49 |
+
.. automethod:: {{ [objname, item] | join(".") }}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{% endif %}
|
| 53 |
+
|
| 54 |
+
{% endblock %}
|
| 55 |
+
|
alphagenome/source/docs/source/api/data.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data
|
| 2 |
+
|
| 3 |
+
Classes and utilities for manipulating genomics data.
|
| 4 |
+
|
| 5 |
+
## Fold Intervals
|
| 6 |
+
|
| 7 |
+
``` {eval-rst}
|
| 8 |
+
.. currentmodule:: alphagenome
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
``` {eval-rst}
|
| 12 |
+
|
| 13 |
+
.. autosummary::
|
| 14 |
+
:toctree: generated
|
| 15 |
+
|
| 16 |
+
data.fold_intervals.Subset
|
| 17 |
+
data.fold_intervals.get_all_folds
|
| 18 |
+
data.fold_intervals.get_fold_names
|
| 19 |
+
data.fold_intervals.get_fold_intervals
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
## Genome
|
| 23 |
+
|
| 24 |
+
``` {eval-rst}
|
| 25 |
+
.. currentmodule:: alphagenome
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
``` {eval-rst}
|
| 29 |
+
|
| 30 |
+
.. autosummary::
|
| 31 |
+
:toctree: generated
|
| 32 |
+
|
| 33 |
+
data.genome.Strand
|
| 34 |
+
data.genome.Interval
|
| 35 |
+
data.genome.Variant
|
| 36 |
+
data.genome.Junction
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Gene annotation
|
| 40 |
+
|
| 41 |
+
``` {eval-rst}
|
| 42 |
+
|
| 43 |
+
.. autosummary::
|
| 44 |
+
:toctree: generated
|
| 45 |
+
|
| 46 |
+
data.gene_annotation.TranscriptType
|
| 47 |
+
data.gene_annotation.extract_tss
|
| 48 |
+
data.gene_annotation.filter_transcript_type
|
| 49 |
+
data.gene_annotation.filter_protein_coding
|
| 50 |
+
data.gene_annotation.filter_to_longest_transcript
|
| 51 |
+
data.gene_annotation.filter_transcript_support_level
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Ontology
|
| 55 |
+
|
| 56 |
+
``` {eval-rst}
|
| 57 |
+
|
| 58 |
+
.. autosummary::
|
| 59 |
+
:toctree: generated
|
| 60 |
+
|
| 61 |
+
data.ontology.OntologyType
|
| 62 |
+
data.ontology.OntologyTerm
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Track data
|
| 66 |
+
|
| 67 |
+
``` {eval-rst}
|
| 68 |
+
|
| 69 |
+
.. autosummary::
|
| 70 |
+
:toctree: generated
|
| 71 |
+
|
| 72 |
+
data.track_data.TrackData
|
| 73 |
+
data.track_data.concat
|
| 74 |
+
data.track_data.interleave
|
| 75 |
+
data.track_data.metadata_to_proto
|
| 76 |
+
data.track_data.metadata_from_proto
|
| 77 |
+
data.track_data.from_protos
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## Transcript
|
| 81 |
+
|
| 82 |
+
``` {eval-rst}
|
| 83 |
+
|
| 84 |
+
.. autosummary::
|
| 85 |
+
:toctree: generated
|
| 86 |
+
|
| 87 |
+
data.transcript.Transcript
|
| 88 |
+
data.transcript.TranscriptExtractor
|
| 89 |
+
```
|
alphagenome/source/docs/source/api/index.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API
|
| 2 |
+
|
| 3 |
+
``` {toctree}
|
| 4 |
+
:maxdepth: 1
|
| 5 |
+
:hidden:
|
| 6 |
+
|
| 7 |
+
data
|
| 8 |
+
models
|
| 9 |
+
interpretation
|
| 10 |
+
visualization
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
<!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
|
| 14 |
+
|
| 15 |
+
::::{grid} 1 1 2 3
|
| 16 |
+
:gutter: 2
|
| 17 |
+
|
| 18 |
+
:::{grid-item-card} Data
|
| 19 |
+
:link: data
|
| 20 |
+
:link-type: doc
|
| 21 |
+
|
| 22 |
+
Classes and utilities for manipulating genomics data.
|
| 23 |
+
:::
|
| 24 |
+
|
| 25 |
+
:::{grid-item-card} Models
|
| 26 |
+
:link: models
|
| 27 |
+
:link-type: doc
|
| 28 |
+
|
| 29 |
+
AlphaGenome client and variant scorers.
|
| 30 |
+
:::
|
| 31 |
+
|
| 32 |
+
:::{grid-item-card} Interpretation
|
| 33 |
+
:link: interpretation
|
| 34 |
+
:link-type: doc
|
| 35 |
+
|
| 36 |
+
Sequence interpretation tools (like in silico mutagenesis).
|
| 37 |
+
:::
|
| 38 |
+
|
| 39 |
+
:::{grid-item-card} Visualization
|
| 40 |
+
:link: visualization
|
| 41 |
+
:link-type: doc
|
| 42 |
+
|
| 43 |
+
Visualization and plotting tools.
|
| 44 |
+
:::
|
| 45 |
+
|
| 46 |
+
<!-- mdformat on -->
|
alphagenome/source/docs/source/api/interpretation.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Interpretation
|
| 2 |
+
|
| 3 |
+
Sequence interpretation tools (like in silico mutagenesis).
|
| 4 |
+
|
| 5 |
+
## ISM
|
| 6 |
+
|
| 7 |
+
``` {eval-rst}
|
| 8 |
+
.. module:: alphagenome.interpretation
|
| 9 |
+
.. currentmodule:: alphagenome
|
| 10 |
+
|
| 11 |
+
.. autosummary::
|
| 12 |
+
:toctree: generated
|
| 13 |
+
|
| 14 |
+
interpretation.ism.ism_variants
|
| 15 |
+
interpretation.ism.ism_matrix
|
| 16 |
+
```
|
alphagenome/source/docs/source/api/models.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Models
|
| 2 |
+
|
| 3 |
+
AlphaGenome client and variant scorers.
|
| 4 |
+
|
| 5 |
+
## DNA Client
|
| 6 |
+
|
| 7 |
+
``` {eval-rst}
|
| 8 |
+
.. currentmodule:: alphagenome
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
``` {eval-rst}
|
| 12 |
+
|
| 13 |
+
.. autosummary::
|
| 14 |
+
:toctree: generated
|
| 15 |
+
|
| 16 |
+
models.dna_client.create
|
| 17 |
+
models.dna_client.ModelVersion
|
| 18 |
+
models.dna_client.Organism
|
| 19 |
+
models.dna_client.validate_sequence_length
|
| 20 |
+
models.dna_client.DnaClient
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## DNA Output
|
| 24 |
+
|
| 25 |
+
``` {eval-rst}
|
| 26 |
+
|
| 27 |
+
.. autosummary::
|
| 28 |
+
:toctree: generated
|
| 29 |
+
|
| 30 |
+
models.dna_output.OutputType
|
| 31 |
+
models.dna_output.Output
|
| 32 |
+
models.dna_output.OutputMetadata
|
| 33 |
+
models.dna_output.VariantOutput
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## Variant Scorers
|
| 37 |
+
|
| 38 |
+
``` {eval-rst}
|
| 39 |
+
|
| 40 |
+
.. autosummary::
|
| 41 |
+
:toctree: generated
|
| 42 |
+
|
| 43 |
+
models.variant_scorers.AggregationType
|
| 44 |
+
models.variant_scorers.BaseVariantScorer
|
| 45 |
+
models.variant_scorers.CenterMaskScorer
|
| 46 |
+
models.variant_scorers.ContactMapScorer
|
| 47 |
+
models.variant_scorers.GeneMaskLFCScorer
|
| 48 |
+
models.variant_scorers.GeneMaskActiveScorer
|
| 49 |
+
models.variant_scorers.GeneMaskSplicingScorer
|
| 50 |
+
models.variant_scorers.PolyadenylationScorer
|
| 51 |
+
models.variant_scorers.SpliceJunctionScorer
|
| 52 |
+
models.variant_scorers.get_recommended_scorers
|
| 53 |
+
models.variant_scorers.tidy_anndata
|
| 54 |
+
models.variant_scorers.tidy_scores
|
| 55 |
+
```
|
alphagenome/source/docs/source/api/visualization.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Visualization
|
| 2 |
+
|
| 3 |
+
Visualization and plotting tools.
|
| 4 |
+
|
| 5 |
+
## Plot
|
| 6 |
+
|
| 7 |
+
``` {eval-rst}
|
| 8 |
+
.. currentmodule:: alphagenome
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
``` {eval-rst}
|
| 12 |
+
|
| 13 |
+
.. autosummary::
|
| 14 |
+
:toctree: generated
|
| 15 |
+
|
| 16 |
+
visualization.plot.seqlogo
|
| 17 |
+
visualization.plot.plot_contact_map
|
| 18 |
+
visualization.plot.plot_track
|
| 19 |
+
visualization.plot.plot_tracks
|
| 20 |
+
visualization.plot.sashimi_plot
|
| 21 |
+
visualization.plot.pad_track
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
(visualization/plot-components)=
|
| 25 |
+
|
| 26 |
+
## Plot components
|
| 27 |
+
|
| 28 |
+
``` {eval-rst}
|
| 29 |
+
|
| 30 |
+
.. autosummary::
|
| 31 |
+
:toctree: generated
|
| 32 |
+
|
| 33 |
+
visualization.plot_components.plot
|
| 34 |
+
visualization.plot_components.AbstractComponent
|
| 35 |
+
visualization.plot_components.Tracks
|
| 36 |
+
visualization.plot_components.OverlaidTracks
|
| 37 |
+
visualization.plot_components.ContactMaps
|
| 38 |
+
visualization.plot_components.ContactMapsDiff
|
| 39 |
+
visualization.plot_components.TranscriptAnnotation
|
| 40 |
+
visualization.plot_components.SeqLogo
|
| 41 |
+
visualization.plot_components.Sashimi
|
| 42 |
+
visualization.plot_components.EmptyComponent
|
| 43 |
+
visualization.plot_components.AbstractAnnotation
|
| 44 |
+
visualization.plot_components.IntervalAnnotation
|
| 45 |
+
visualization.plot_components.VariantAnnotation
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
## Plot transcripts
|
| 49 |
+
|
| 50 |
+
``` {eval-rst}
|
| 51 |
+
|
| 52 |
+
.. autosummary::
|
| 53 |
+
:toctree: generated
|
| 54 |
+
|
| 55 |
+
visualization.plot_transcripts.TranscriptStyle
|
| 56 |
+
visualization.plot_transcripts.TranscriptStylePreset
|
| 57 |
+
visualization.plot_transcripts.plot_transcripts
|
| 58 |
+
visualization.plot_transcripts.draw_interval
|
| 59 |
+
visualization.plot_transcripts.draw_transcript
|
| 60 |
+
```
|
alphagenome/source/docs/source/conf.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Configuration file for the Sphinx documentation builder."""
|
| 16 |
+
|
| 17 |
+
#
|
| 18 |
+
# This file only contains a selection of the most common options. For a full
|
| 19 |
+
# list see the documentation:
|
| 20 |
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
| 21 |
+
|
| 22 |
+
# -- Path setup --------------------------------------------------------------
|
| 23 |
+
|
| 24 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
| 25 |
+
# add these directories to sys.path here. If the directory is relative to the
|
| 26 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
| 27 |
+
#
|
| 28 |
+
import importlib.metadata
|
| 29 |
+
import inspect
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
|
| 33 |
+
# The package is installed by Readthedocs before sphinx building.
|
| 34 |
+
import alphagenome # pylint: disable=unused-import, g-import-not-at-top
|
| 35 |
+
import alphagenome.models.dna_client # pylint: disable=unused-import, g-import-not-at-top
|
| 36 |
+
|
| 37 |
+
# -- Project information -----------------------------------------------------
|
| 38 |
+
|
| 39 |
+
project = 'alphagenome'
|
| 40 |
+
project_info = importlib.metadata.metadata(project)
|
| 41 |
+
author = project_info['Author']
|
| 42 |
+
copyright = f'2024, {author}' # pylint: disable=redefined-builtin
|
| 43 |
+
version = project_info['Version']
|
| 44 |
+
repository_url = f'https://github.com/google-deepmind/{project}'
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# The full version, including alpha/beta/rc tags
|
| 48 |
+
release = version
|
| 49 |
+
# Warn if links are broken
|
| 50 |
+
nitpicky = True
|
| 51 |
+
|
| 52 |
+
# -- General configuration ---------------------------------------------------
|
| 53 |
+
|
| 54 |
+
# Add any Sphinx extension module names here, as strings. They can be
|
| 55 |
+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
| 56 |
+
# ones.
|
| 57 |
+
extensions = [
|
| 58 |
+
'myst_nb',
|
| 59 |
+
'sphinx_design',
|
| 60 |
+
'sphinx.ext.autodoc',
|
| 61 |
+
'sphinx.ext.intersphinx',
|
| 62 |
+
'sphinx.ext.autosummary',
|
| 63 |
+
'sphinx.ext.napoleon',
|
| 64 |
+
'sphinxcontrib.bibtex',
|
| 65 |
+
'sphinx_autodoc_typehints',
|
| 66 |
+
'sphinx.ext.mathjax',
|
| 67 |
+
'IPython.sphinxext.ipython_console_highlighting',
|
| 68 |
+
'sphinx.ext.coverage',
|
| 69 |
+
'sphinx_copybutton',
|
| 70 |
+
'sphinx_remove_toctrees',
|
| 71 |
+
'sphinx.ext.linkcode',
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
autosummary_generate = True
|
| 75 |
+
autodoc_member_order = 'groupwise'
|
| 76 |
+
default_role = 'literal'
|
| 77 |
+
bibtex_reference_style = 'author_year'
|
| 78 |
+
napoleon_google_docstring = True
|
| 79 |
+
napoleon_numpy_docstring = False
|
| 80 |
+
napoleon_include_init_with_doc = False
|
| 81 |
+
napoleon_use_rtype = True
|
| 82 |
+
napoleon_use_param = True
|
| 83 |
+
myst_heading_anchors = 6 # Create heading anchors for h1-h6
|
| 84 |
+
autodoc_mock_imports = [
|
| 85 |
+
'google.protobuf.runtime_version',
|
| 86 |
+
'google.protobuf.internal.builder',
|
| 87 |
+
'absl',
|
| 88 |
+
'alphagenome.protos',
|
| 89 |
+
]
|
| 90 |
+
remove_from_toctrees = ['api/generated/*']
|
| 91 |
+
bibtex_bibfiles = ['refs.bib']
|
| 92 |
+
|
| 93 |
+
myst_enable_extensions = [
|
| 94 |
+
'amsmath',
|
| 95 |
+
'colon_fence',
|
| 96 |
+
'deflist',
|
| 97 |
+
'dollarmath',
|
| 98 |
+
'html_image',
|
| 99 |
+
'html_admonition',
|
| 100 |
+
'attrs_inline',
|
| 101 |
+
'attrs_block',
|
| 102 |
+
]
|
| 103 |
+
|
| 104 |
+
# TODO(b/372225132): Resolve showing notebook output without executing.
|
| 105 |
+
# TODO(b/372226231): Resolve not modifying notebook when building docs.
|
| 106 |
+
myst_url_schemes = ['http', 'https', 'mailto']
|
| 107 |
+
nb_output_stderr = 'remove'
|
| 108 |
+
nb_execution_mode = 'off'
|
| 109 |
+
nb_merge_streams = True
|
| 110 |
+
typehints_defaults = 'braces'
|
| 111 |
+
|
| 112 |
+
source_suffix = {
|
| 113 |
+
'.rst': 'restructuredtext',
|
| 114 |
+
'.ipynb': 'myst-nb',
|
| 115 |
+
'.myst': 'myst-nb',
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
intersphinx_mapping = {
|
| 119 |
+
'python': ('https://docs.python.org/3', None),
|
| 120 |
+
'anndata': ('https://anndata.readthedocs.io/en/stable/', None),
|
| 121 |
+
'numpy': ('https://numpy.org/doc/stable/', None),
|
| 122 |
+
'jax': ('https://jax.readthedocs.io/en/latest/', None),
|
| 123 |
+
'pandas': ('https://pandas.pydata.org/docs/', None),
|
| 124 |
+
'matplotlib': ('https://matplotlib.org/stable/', None),
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
# Add any paths that contain templates here, relative to this directory.
|
| 128 |
+
templates_path = ['_templates']
|
| 129 |
+
|
| 130 |
+
# List of patterns, relative to source directory, that match files and
|
| 131 |
+
# directories to ignore when looking for source files.
|
| 132 |
+
# This pattern also affects html_static_path and html_extra_path.
|
| 133 |
+
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'protos']
|
| 134 |
+
|
| 135 |
+
# -- Options for autodoc -----------------------------------------------------
|
| 136 |
+
|
| 137 |
+
autodoc_default_options = {
|
| 138 |
+
'member-order': 'bysource',
|
| 139 |
+
'special-members': True,
|
| 140 |
+
'exclude-members': '__repr__, __str__, __weakref__',
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# -- Source code links -----------------------------------------------------
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def linkcode_resolve(domain, info):
  """Resolve a GitHub URL corresponding to a documented Python object.

  This is the hook called by the `sphinx.ext.linkcode` extension for every
  documented object in order to build its "[source]" link.

  Args:
    domain: Sphinx domain of the object. Only 'py' is handled.
    info: Mapping describing the object; the 'module' and 'fullname' keys are
      read.

  Returns:
    URL of the object's source lines in the GitHub repository, or None if the
    object cannot be mapped to a source location (non-Python domain, module
    not imported, attribute missing, or source unavailable).
  """
  if domain != 'py':
    return None

  # `sys.modules` is a plain dict, so a module that has not been imported is
  # reported via a missing key (KeyError), never via ImportError.
  mod = sys.modules.get(info['module'])
  if mod is None:
    return None

  obj = mod
  try:
    for attr in info['fullname'].split('.'):
      obj = getattr(obj, attr)
  except AttributeError:
    return None
  # Look through decorators so the link points at the real implementation.
  obj = inspect.unwrap(obj)

  try:
    filename = inspect.getsourcefile(obj)
    source, lineno = inspect.getsourcelines(obj)
  except (TypeError, OSError):
    # TypeError: built-in/extension objects; OSError: source not retrievable.
    return None
  if filename is None:
    # getsourcefile() returns None (rather than raising) when no Python source
    # file is associated with the object.
    return None

  path = os.path.relpath(filename, start=os.path.dirname(alphagenome.__file__))
  # GitHub highlights a range of lines with the '#L<start>-L<end>' fragment.
  return (
      f'{repository_url}/tree/main/src/{project}/'
      f'{path}#L{lineno}-L{lineno + len(source) - 1}'
  )
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# -- Options for HTML output -------------------------------------------------
|
| 184 |
+
|
| 185 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
| 186 |
+
# a list of builtin themes.
|
| 187 |
+
|
| 188 |
+
html_theme = 'sphinx_book_theme'
|
| 189 |
+
html_title = 'AlphaGenome'
|
| 190 |
+
pygments_style = 'default'
|
| 191 |
+
html_theme_options = {
|
| 192 |
+
'repository_url': repository_url,
|
| 193 |
+
'repository_branch': 'main',
|
| 194 |
+
'use_repository_button': True,
|
| 195 |
+
'launch_buttons': {
|
| 196 |
+
'colab_url': 'https://colab.research.google.com',
|
| 197 |
+
},
|
| 198 |
+
'article_header_start': ['toggle-primary-sidebar.html', 'breadcrumbs'],
|
| 199 |
+
'show_prev_next': False,
|
| 200 |
+
}
|
| 201 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
| 202 |
+
# relative to this directory. They are copied after the builtin static files,
|
| 203 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
| 204 |
+
# html_static_path = ['_static']
|
| 205 |
+
|
| 206 |
+
# TODO: b/377291190 - Look at adding notebook support (see haiku example)
|
alphagenome/source/docs/source/exploring_model_metadata.md
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model output metadata
|
| 2 |
+
|
| 3 |
+
AlphaGenome returns predictions for 11 different output types, covering a
|
| 4 |
+
variety of modalities. Here we provide details about the human model outputs and
|
| 5 |
+
associated metadata to help users make informed decisions about the parameters
|
| 6 |
+
of their API requests (e.g., ontology term; output types).
|
| 7 |
+
|
| 8 |
+
For further details on dataset processing and precise definitions of each output
|
| 9 |
+
type, including their respective units and normalization methods, please refer
|
| 10 |
+
to the Methods section of the AlphaGenome paper.
|
| 11 |
+
|
| 12 |
+
<!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
|
| 13 |
+
<!-- mdlint off(LINK_ID) -->
|
| 14 |
+
|
| 15 |
+
{#model_metadata-table1-target}
|
| 16 |
+
**Table 1**: Descriptions of output types predicted by AlphaGenome.
|
| 17 |
+
|
| 18 |
+
| OutputType name | Description | Units | Resolution | Unique biosamples | Total tracks |
|
| 19 |
+
| :--- | :--- | :--- | :--- | :--- | :--- |
|
| 20 |
+
| [RNA\_SEQ](#alphagenome.models.dna_output.OutputType.RNA_SEQ) | RNA expression as measured by RNA-seq. Includes a mixture of PolyA+ RNA and Total RNA assays. Some tracks are also stranded. | Normalized read signal | 1bp | 285 | 667 |
|
| 21 |
+
| [CAGE](#alphagenome.models.dna_output.OutputType.CAGE) | RNA expression at transcription start-sites as measured by Cap Analysis Gene Expression (CAGE) assay. | Normalized read signal | 1bp | 264 | 546 |
|
| 22 |
+
| [PROCAP](#alphagenome.models.dna_output.OutputType.PROCAP) | RNA expression at transcription start-sites as measured by Precision Run-On sequencing and capping (PROCAP) assay. | Normalized read signal | 1bp | 6 | 12 |
|
| 23 |
+
| [DNASE](#alphagenome.models.dna_output.OutputType.DNASE) | Chromatin accessibility as measured by DNase I hypersensitive sites sequencing (DNase-seq) assay. | Normalized insertion signal | 1bp | 305 | 305 |
|
| 24 |
+
| [ATAC](#alphagenome.models.dna_output.OutputType.ATAC) | Chromatin accessibility as measured by the transposase-accessible chromatin (ATAC-seq) assay. | Normalized insertion signal | 1bp | 167 | 167 |
|
| 25 |
+
| [CHIP\_HISTONE](#alphagenome.models.dna_output.OutputType.CHIP_HISTONE) | Relative abundance of histone modification marks as measured by chromatin immunoprecipitation (ChIP-seq) for 24 different markers e.g. H3K27ac (see ENCODE [documentation](https://www.encodeproject.org/chip-seq/histone/)). | Fold-change over control | 128bp | 219 | 1116 |
|
| 26 |
+
| [CHIP\_TF](#alphagenome.models.dna_output.OutputType.CHIP_TF) | Relative abundance of DNA-bound transcription factors as measured by ChIP-seq targeting 43 different proteins (see ENCODE [documentation](https://www.encodeproject.org/chip-seq/transcription_factor/)). | Fold-change over control | 128bp | 163 | 1617 |
|
| 27 |
+
| [SPLICE\_SITES](#alphagenome.models.dna_output.OutputType.SPLICE_SITES) | Predicted location of donor or acceptor splice sites, for both the positive and negative strand, expressed as a probability (higher numbers indicate higher probability of the base being a splice site). | Predicted probability | 1bp | NA | 4 |
|
| 28 |
+
| [SPLICE\_JUNCTIONS](#alphagenome.models.dna_output.OutputType.SPLICE_JUNCTIONS) | Splice junction spliced read counts, as measured by RNA-Seq. Predictions are for all possible pairings of at most 512 donors and 512 acceptors from each strand in the requested interval, where the position of donors and acceptors along the input sequence is given by predictions of splice site positions. | Normalized junction signal | 1bp | 282 | 734 |
|
| 29 |
+
| [SPLICE\_SITE\_USAGE](#alphagenome.models.dna_output.OutputType.SPLICE_SITE_USAGE) | Fraction of transcripts using a splice site, as measured by RNA-seq. All reads that span a given splice site are considered, and we predict the fraction of these that use the site (donor or acceptor). | Fraction | 1bp | 282 | 734 |
|
| 30 |
+
| [CONTACT\_MAPS](#alphagenome.models.dna_output.OutputType.CONTACT_MAPS) | Relative frequency of physical contact between pairwise positions (symmetric), derived from chromatin contact maps (Micro-C and Hi-C assays). Values are coarse-grained and normalized by removing the off-diagonal power law decay (as also done in [Zhou, J. 2022](https://www.nature.com/articles/s41588-022-01065-4)). | Log-fold over genomic distance-based expectation | 2048bp | 12 | 28 |
|
| 31 |
+
<!-- mdlint on -->
|
| 32 |
+
<!-- mdformat on -->
|
| 33 |
+
|
| 34 |
+
## Track metadata
|
| 35 |
+
|
| 36 |
+
To access the metadata describing each track for human outputs use:
|
| 37 |
+
|
| 38 |
+
```py
|
| 39 |
+
output_metadata = dna_model.output_metadata(
|
| 40 |
+
organism=dna_client.Organism.HOMO_SAPIENS
|
| 41 |
+
)
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Each predicted output type (e.g., RNA\_SEQ) contains metadata in a
|
| 45 |
+
{py:class}`~pandas.DataFrame`: {py:class}`output_metadata.rna_seq
|
| 46 |
+
<alphagenome.models.dna_output.OutputMetadata.rna_seq>`
|
| 47 |
+
|
| 48 |
+
Each row of the {class}`~pandas.DataFrame` corresponds to a ‘track’, and each
|
| 49 |
+
column contains key information for biological interpretation such as:
|
| 50 |
+
|
| 51 |
+
* `name`: Name of the track. Example: `CL:0000047 polyA plus RNA-seq`.
|
| 52 |
+
* `strand`: Strand of the track, either positive (`+`), negative (`-`), or
|
| 53 |
+
unstranded (`.`).
|
| 54 |
+
* `ontology_curie`: A string ID representing the ontology term corresponding
|
| 55 |
+
to the biosample. Example: `CL:0000100`.
|
| 56 |
+
* `biosample_name`: Plain text description of the biosample. Example: `motor
|
| 57 |
+
neuron`.
|
| 58 |
+
|
| 59 |
+
For a full list of metadata columns available for each output type, please see
|
| 60 |
+
the [navigating data ontologies notebook](colabs/tissue_ontology_mapping), which
|
| 61 |
+
demonstrates how to access and browse track metadata.
|
| 62 |
+
|
| 63 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 64 |
+
|
| 65 |
+
:::{note} For `SPLICE_JUNCTIONS` outputs the strand information is a property of
|
| 66 |
+
a junction rather than a track, so the metadata for this output type will show
|
| 67 |
+
half as many rows as reported in the above table.
|
| 68 |
+
:::
|
| 69 |
+
|
| 70 |
+
<!-- mdformat on -->
|
| 71 |
+
|
| 72 |
+
## Additional track metadata
|
| 73 |
+
|
| 74 |
+
Some output types contain additional columns. For example,
|
| 75 |
+
{py:class}`OutputMetadata.rna_seq
|
| 76 |
+
<alphagenome.models.dna_output.OutputMetadata.rna_seq>` and
|
| 77 |
+
{py:class}`OutputMetadata.splice_sites
|
| 78 |
+
<alphagenome.models.dna_output.OutputMetadata.splice_sites>` also contain a
|
| 79 |
+
`gtex_tissue` column, which is populated for the tracks that make predictions
|
| 80 |
+
for the tissues sampled in the
|
| 81 |
+
[GTEx project](https://gtexportal.org/home/samplingSitePage)
|
| 82 |
+
{cite:t}`gtex2020gtex`.
|
| 83 |
+
|
| 84 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 85 |
+
:::{note} For one tissue,
|
| 86 |
+
‘Brain \- Cerebellar hemisphere’, we used an alternative Uberon ID to the one
|
| 87 |
+
provided in the
|
| 88 |
+
[GTEx documentation](https://gtexportal.org/home/samplingSitePage)
|
| 89 |
+
(‘UBERON:0002037’), to reflect Uberon’s ID for cerebellar hemisphere:
|
| 90 |
+
[‘UBERON:0002245’](https://www.ebi.ac.uk/ols4/ontologies/uberon/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FUBERON_0002245).
|
| 91 |
+
:::
|
| 92 |
+
|
| 93 |
+
<!-- mdformat on -->
|
alphagenome/source/docs/source/faqs.md
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FAQ
|
| 2 |
+
|
| 3 |
+
Frequently asked questions.
|
| 4 |
+
|
| 5 |
+
## Model inputs
|
| 6 |
+
|
| 7 |
+
### How do I make predictions for a specific genomic region?
|
| 8 |
+
|
| 9 |
+
You can define any region in either the human or mouse genome, and use the API
|
| 10 |
+
to predict various outputs. See the [quick start colab](colabs/quick_start) for
|
| 11 |
+
a demonstration.
|
| 12 |
+
|
| 13 |
+
### How do I specify a genomic region?
|
| 14 |
+
|
| 15 |
+
Using the {class}`genome.Interval<alphagenome.data.genome.Interval>` class,
|
| 16 |
+
which is initialized with a chromosome, a start, and an end position.
|
| 17 |
+
|
| 18 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 19 |
+
|
| 20 |
+
:::{note}
|
| 21 |
+
AlphaGenome classes such as {class}`genome.Interval<alphagenome.data.genome.Interval>`
|
| 22 |
+
use 0-based indexing, consistent with the underlying Python implementations.
|
| 23 |
+
|
| 24 |
+
This means an
|
| 25 |
+
{class}`genome.Interval<alphagenome.data.genome.Interval>` includes the base
|
| 26 |
+
pair at the `start` position up to the base pair at the `end-1` position.
|
| 27 |
+
|
| 28 |
+
For example, to specify the first base pair of chromosome 1, use
|
| 29 |
+
`genome.Interval('chr1', 0, 1)`. This interval has a width of 1, and contains
|
| 30 |
+
only the base pair at the first position of chromosome 1.
|
| 31 |
+
|
| 32 |
+
To interpret interval overlaps, remember that 0-based indexing excludes the base
|
| 33 |
+
pair at the `end` position itself, such that
|
| 34 |
+
`genome.Interval('chr1', 0, 1).overlaps(genome.Interval('chr1', 1, 2))`
|
| 35 |
+
returns `False`.
|
| 36 |
+
:::
|
| 37 |
+
|
| 38 |
+
<!-- mdformat on -->
|
| 39 |
+
|
| 40 |
+
### What are the reference genome versions used by the model?
|
| 41 |
+
|
| 42 |
+
We use human genome assembly hg38 (GRCh38.p13.genome.fa) and mouse assembly mm10
|
| 43 |
+
(GRCm38.p6.genome.fa). For other genome builds (such as hg19, for example), the
|
| 44 |
+
[LiftOver](https://genome.ucsc.edu/cgi-bin/hgLiftOver) tool can be used to
|
| 45 |
+
convert from hg38 coordinates to the desired assembly.
|
| 46 |
+
|
| 47 |
+
### Can I make a prediction for any arbitrary DNA sequence?
|
| 48 |
+
|
| 49 |
+
Yes, you can make predictions for any sequence, provided it is within the range
|
| 50 |
+
of sequence lengths supported by the model. Note that model predictions have
|
| 51 |
+
only been evaluated using sequences that vary by a relatively small amount from
|
| 52 |
+
the reference genome (SNPs and indels), so very large differences from the human
|
| 53 |
+
reference genome (for example, structural variants, sequences with a large
|
| 54 |
+
amount of padding, synthetic sequences, or artificial DNA constructs) may result
|
| 55 |
+
in predictions that are not as reliable.
|
| 56 |
+
|
| 57 |
+
### Can I make predictions for DNA from other species?
|
| 58 |
+
|
| 59 |
+
Yes, with the caveat that the model has only been trained on mouse and human
|
| 60 |
+
DNA. Prediction quality is likely to degrade as evolutionary distance from these
|
| 61 |
+
two species increases, but note that this has not been formally benchmarked.
|
| 62 |
+
|
| 63 |
+
### What is the longest sequence the model can take as input?
|
| 64 |
+
|
| 65 |
+
1MB (precisely 2^20 base-pairs long). Other sequence lengths are also supported:
|
| 66 |
+
\~2KB, \~16KB, \~100KB, \~500KB.
|
| 67 |
+
|
| 68 |
+
### How do I request predictions for a sequence with a length that is not in the list of supported lengths?
|
| 69 |
+
|
| 70 |
+
You can use
|
| 71 |
+
{func}`genome.Interval.resize<alphagenome.data.genome.Interval.resize>` to crop
|
| 72 |
+
or expand your sequence length to the nearest supported length.
|
| 73 |
+
|
| 74 |
+
Note that `.resize` expands sequences using the actual surrounding genomic data,
|
| 75 |
+
not by adding padding.
|
| 76 |
+
|
| 77 |
+
## Model outputs
|
| 78 |
+
|
| 79 |
+
### How many tracks are there per output type and what do they represent?
|
| 80 |
+
|
| 81 |
+
This varies from 5 to over 600. Each of the tracks refers to a particular
|
| 82 |
+
cell-type or tissue, as well as other properties, such as strand or a specific
|
| 83 |
+
transcription factor (for the `CHIP_TF` output type). See the
|
| 84 |
+
[output metadata documentation](project:exploring_model_metadata.md#model-output-metadata)
|
| 85 |
+
for a full list of the output types.
|
| 86 |
+
|
| 87 |
+
### How do I find out what tissue or cell-type an output ‘track’ refers to?
|
| 88 |
+
|
| 89 |
+
Using the [navigating data ontologies notebook](colabs/tissue_ontology_mapping),
|
| 90 |
+
you can look at the output metadata where biosample names and ontology CURIEs
|
| 91 |
+
(IDs) for each track are described.
|
| 92 |
+
|
| 93 |
+
### What is an ontology CURIE?
|
| 94 |
+
|
| 95 |
+
CURIEs (Compact Uniform Resource Identifiers) are standardized, abbreviated
|
| 96 |
+
codes (e.g., ‘UBERON:0001114’ for liver) that uniquely identify specific
|
| 97 |
+
ontology terms.
|
| 98 |
+
|
| 99 |
+
### Where are your ontology CURIEs sourced from?
|
| 100 |
+
|
| 101 |
+
We source these from the IDs provided in the source training data. We also
|
| 102 |
+
restricted the ontology types to UBERON, CL, CLO and EFO, following ENCODE
|
| 103 |
+
practices. We recommend using EBI's
|
| 104 |
+
[Ontology Lookup Service](https://www.ebi.ac.uk/ols4) to understand
|
| 105 |
+
relationships between the ontology IDs for different tracks.
|
| 106 |
+
|
| 107 |
+
### What is strandedness?
|
| 108 |
+
|
| 109 |
+
DNA is double-stranded, meaning that there are two nucleotide strands that form
|
| 110 |
+
the double helix. By convention, one of those molecules is designated the
|
| 111 |
+
forward, or positive strand (5'->3'), and the other is designated the reverse,
|
| 112 |
+
or negative strand (3'->5').
|
| 113 |
+
|
| 114 |
+
Genomic assays can either be unstranded or stranded (also called
|
| 115 |
+
strand-specific).
|
| 116 |
+
|
| 117 |
+
* Unstranded assays return results that do not distinguish whether a
|
| 118 |
+
measurement came from the positive or negative strand. Certain assays do not
|
| 119 |
+
generate stranded information – for example, ATAC-seq generates unstranded
|
| 120 |
+
accessibility information.
|
| 121 |
+
* Stranded (or strand-specific) assays annotate each measurement as coming
|
| 122 |
+
from the positive or negative strand. This is important for transcriptional
|
| 123 |
+
assays to distinguish between strand-specific transcripts (for example, two
|
| 124 |
+
transcripts that share a transcriptional start site but are on different
|
| 125 |
+
strands).
|
| 126 |
+
|
| 127 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 128 |
+
|
| 129 |
+
:::{note}
|
| 130 |
+
Not all RNA-seq samples will be stranded, especially those that are
|
| 131 |
+
from older experiments. For example, GTEx RNA-seq data is unstranded.
|
| 132 |
+
:::
|
| 133 |
+
|
| 134 |
+
<!-- mdformat on -->
|
| 135 |
+
|
| 136 |
+
For more general information about the difference between non-stranded and
|
| 137 |
+
stranded protocols and how to interpret them, there is a helpful tutorial
|
| 138 |
+
[here](https://www.ecseq.com/support/ngs/how-do-strand-specific-sequencing-protocols-work).
|
| 139 |
+
|
| 140 |
+
### How is strandedness handled in model outputs?
|
| 141 |
+
|
| 142 |
+
In the model output metadata, we use the following symbols to designate the
|
| 143 |
+
strand of a track:
|
| 144 |
+
|
| 145 |
+
* positive: `+`
|
| 146 |
+
* negative: `-`
|
| 147 |
+
* unstranded: `.`
|
| 148 |
+
|
| 149 |
+
For assays that were performed in a stranded (or strand-specific) manner, the
|
| 150 |
+
assay will have two tracks per cell or tissue type: one for the positive (`+`)
|
| 151 |
+
and another for the negative (`-`) strand.
|
| 152 |
+
|
| 153 |
+
For unstranded assays, there will be a single track per cell or tissue type,
|
| 154 |
+
annotated as unstranded (`.`).
|
| 155 |
+
|
| 156 |
+
We provide convenience operations for manipulating
|
| 157 |
+
{class}`~alphagenome.data.track_data.TrackData` based on strand information,
|
| 158 |
+
such as
|
| 159 |
+
{func}`~alphagenome.data.track_data.TrackData.filter_to_negative_strand`, etc.
|
| 160 |
+
|
| 161 |
+
### How can I save the model outputs?
|
| 162 |
+
|
| 163 |
+
For *variant effect predictions*: We recommend converting the scores into a
|
| 164 |
+
pandas DataFrame. This DataFrame can then be easily exported to a common file
|
| 165 |
+
format, such as a CSV file, for use with other tools or for record-keeping.
|
| 166 |
+
Specific instructions and examples for this process are provided in our 'Variant
|
| 167 |
+
Scoring UI' tutorial.
|
| 168 |
+
|
| 169 |
+
For *genome track predictions (e.g., RNA-seq levels)*: The predicted track data
|
| 170 |
+
is provided as NumPy arrays within TrackData objects. These arrays can be
|
| 171 |
+
directly saved to disk using standard NumPy functions, such as `numpy.save` (for
|
| 172 |
+
saving a single array to a `.npy` file) or `numpy.savez_compressed` (for saving
|
| 173 |
+
multiple arrays into a single compressed `.npz` file).
|
| 174 |
+
|
| 175 |
+
### What are some of the limitations of the model?
|
| 176 |
+
|
| 177 |
+
AlphaGenome has several key limitations:
|
| 178 |
+
|
| 179 |
+
- *Tissue-specificity and long-range interactions*: While AlphaGenome shows
|
| 180 |
+
improvements in these areas compared to previous models, accurately
|
| 181 |
+
capturing tissue-specific effects and long-range genomic interactions
|
| 182 |
+
remains challenging for deep learning models in genomics, requiring further
|
| 183 |
+
research.
|
| 184 |
+
- *Species scope*: The model is trained and evaluated on human and mouse DNA.
|
| 185 |
+
Its performance on DNA from other species has not been determined.
|
| 186 |
+
- *Personal genomes*: The model has not yet been benchmarked for predicting
|
| 187 |
+
individual (personal) human genomes.
|
| 188 |
+
- *Molecular scope*: AlphaGenome predicts the molecular consequences of
|
| 189 |
+
genetic variations. Its direct applicability to complex trait analysis is
|
| 190 |
+
limited, as these traits also involve broader biological processes (e.g.,
|
| 191 |
+
gene function, development, environmental factors) beyond the model's
|
| 192 |
+
primary focus.
|
| 193 |
+
- *Unphased training and single sequence input*: The model processes a single
|
| 194 |
+
DNA sequence at a time and is therefore not inherently 'diploid-aware'. It
|
| 195 |
+
was trained using unphased data, meaning it could not learn to distinguish
|
| 196 |
+
between alleles inherited from the mother versus the father. Consequently,
|
| 197 |
+
its variant effect predictions do not inherently model heterozygous states
|
| 198 |
+
(i.e., the presence of both a reference and a variant allele at a site
|
| 199 |
+
simultaneously).
|
| 200 |
+
|
| 201 |
+
## Visualizing predictions
|
| 202 |
+
|
| 203 |
+
### How do I visualize the predicted output?
|
| 204 |
+
|
| 205 |
+
You can use any tool to visualize the numerical output, but we provide a Python
|
| 206 |
+
[visualization library](project:api/visualization.md#Visualization) so you can
|
| 207 |
+
easily visualize the output immediately. You can use our
|
| 208 |
+
[visualization basics guide](project:visualization_library_basics.md) and see
|
| 209 |
+
examples of how to plot different modalities in our
|
| 210 |
+
[visualizing predictions tutorial](colabs/visualization_modality_tour).
|
| 211 |
+
|
| 212 |
+
### Can I design my own visualizations to work with this library?
|
| 213 |
+
|
| 214 |
+
Yes. The returned figures are based on matplotlib, so they should be extensible.
|
| 215 |
+
Additionally, you can choose to work with the raw output data and design your
|
| 216 |
+
own visualizations.
|
| 217 |
+
|
| 218 |
+
### Where are the plotted transcript annotations from?
|
| 219 |
+
|
| 220 |
+
Transcript annotations are sourced from standard Gene Transfer Format (GTF)
|
| 221 |
+
files from GENCODE: the hg38 reference assembly (release 46) for human and the
|
| 222 |
+
mm10 reference assembly (release M23) for mouse.
|
| 223 |
+
|
| 224 |
+
### Am I limited to only plotting protein-coding genes, and only the longest transcript?
|
| 225 |
+
|
| 226 |
+
No. If you wish to include other gene types or all transcripts (not just the
|
| 227 |
+
longest), you can remove the respective calls to
|
| 228 |
+
`gene_annotation.filter_protein_coding(gtf)` and
|
| 229 |
+
`gene_annotation.filter_to_longest_transcript(gtf)` in your code. Note that
|
| 230 |
+
including more transcripts can make the plot appear busy; you can adjust the
|
| 231 |
+
`fig_height` parameter of the `TranscriptAnnotation` plot component to improve
|
| 232 |
+
legibility.
|
| 233 |
+
|
| 234 |
+
## Variant scoring
|
| 235 |
+
|
| 236 |
+
### How do I define a variant?
|
| 237 |
+
|
| 238 |
+
By creating a {class}`~alphagenome.data.genome.Variant` object.
|
| 239 |
+
|
| 240 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 241 |
+
|
| 242 |
+
:::{note}
|
| 243 |
+
:name: variant-position-is-1-based
|
| 244 |
+
As mentioned above, AlphaGenome classes such as
|
| 245 |
+
{class}`~alphagenome.data.genome.Variant` use 0-indexing, and Variant's
|
| 246 |
+
{func}`~alphagenome.data.genome.Variant.start` and
|
| 247 |
+
{func}`~alphagenome.data.genome.Variant.end` contain 0-indexed values.
|
| 248 |
+
|
| 249 |
+
However, most variants in public databases, such as dbSNP, are provided as
|
| 250 |
+
1-indexed.
|
| 251 |
+
|
| 252 |
+
To enable compatibility with these annotations, the
|
| 253 |
+
{class}`~alphagenome.data.genome.Variant` object is initialized with a
|
| 254 |
+
1-indexed {attr}`~alphagenome.data.genome.Variant.position` attribute, which is
|
| 255 |
+
then converted to 0-indexing internally (i.e.,
|
| 256 |
+
{func}`~alphagenome.data.genome.Variant.start` returns
|
| 257 |
+
{attr}`~alphagenome.data.genome.Variant.position` - 1).
|
| 258 |
+
|
| 259 |
+
See the {class}`~alphagenome.data.genome.Variant` docstring for more details.
|
| 260 |
+
:::
|
| 261 |
+
|
| 262 |
+
<!-- mdformat on -->
|
| 263 |
+
|
| 264 |
+
### Are there tools to help me define variants, and run inference for them?
|
| 265 |
+
|
| 266 |
+
See the
|
| 267 |
+
[scoring and visualizing a single variant notebook](colabs/variant_scoring_ui)
|
| 268 |
+
which walks through how to define a {class}`~alphagenome.data.genome.Variant`
|
| 269 |
+
object and perform inference. Batch inference over many variants can be
|
| 270 |
+
performed using the
|
| 271 |
+
[batch variant scoring notebook](colabs/batch_variant_scoring) which takes a
|
| 272 |
+
variant call file (VCF) as input.
|
| 273 |
+
|
| 274 |
+
### Can I pass any sequence to {class}`~alphagenome.data.genome.Variant.reference_bases` or does it have to match the reference genome sequence at the variant location?
|
| 275 |
+
|
| 276 |
+
You can pass any sequence to
|
| 277 |
+
{class}`~alphagenome.data.genome.Variant.reference_bases`. Note that
|
| 278 |
+
{func}`~alphagenome.models.dna_client.DnaClient.predict_variant` is agnostic to
|
| 279 |
+
the alleles in the reference genome; it instead uses the REF/ALT alleles
|
| 280 |
+
specified by the user.
|
| 281 |
+
|
| 282 |
+
### Are variant predictions for insertions and deletions (indels) supported?
|
| 283 |
+
|
| 284 |
+
Yes. We use left-alignment to specify indels. See
|
| 285 |
+
{class}`~alphagenome.data.genome.Variant` for more details. For scoring indels,
|
| 286 |
+
we adopt SpliceAI's {cite:p}`spliceai` indel alignment strategy: inserted bases
|
| 287 |
+
are summarized by taking the maximum value over the inserted segment, while
|
| 288 |
+
deleted bases are treated as having zero signal in the `ALT` context, thereby
|
| 289 |
+
enabling consistent positional comparisons.
|
| 290 |
+
|
| 291 |
+
### Which variant scorer should I use for a given modality?
|
| 292 |
+
|
| 293 |
+
In practice, you can use most variant scoring strategies for any modality.
|
| 294 |
+
However, we provide a recommendation for the best strategies based on our
|
| 295 |
+
evaluations in the
|
| 296 |
+
[variant scoring documentation](project:variant_scoring.md#variant-scoring).
|
| 297 |
+
|
| 298 |
+
### Can I write my own variant scoring strategy?
|
| 299 |
+
|
| 300 |
+
We do not currently support users writing their own variant scoring strategy.
|
| 301 |
+
However, since variant scoring is simply aggregating REF and ALT track
|
| 302 |
+
predictions, you can write your own methods for handling these values.
|
| 303 |
+
|
| 304 |
+
### What is the difference between a 'quantile_score' and 'raw_score'?
|
| 305 |
+
|
| 306 |
+
The 'raw_score' is the output for a particular variant scoring strategy.
|
| 307 |
+
However, different tracks and modalities yield scores that are on different
|
| 308 |
+
scales. For instance, the
|
| 309 |
+
[Splice Sites Usage scorer](project:variant_scoring.md#splicing-splice-site-usage)
|
| 310 |
+
returns values between 0 and 1, whereas the
|
| 311 |
+
[Gene Expression (RNA-seq)](project:variant_scoring.md#gene-expression-rna-seq)
|
| 312 |
+
scorer returns negative or positive values without bounds. To facilitate
|
| 313 |
+
comparisons across tracks and different variant scoring strategies, we use an
|
| 314 |
+
empirical quantiles approach (see {cite:p}`alphagenome` for full details).
|
| 315 |
+
Briefly, we estimate a background distribution for each variant scorer and track
|
| 316 |
+
using scores for common variants (MAF>0.01 in any gnomAD v3 population). We can
|
| 317 |
+
then convert any 'raw score' into a 'quantile score', representing its rank
|
| 318 |
+
within this background distribution. For example, a variant with a quantile score of
|
| 319 |
+
0.99 has a score equivalent to the 99th percentile of common variants. This
|
| 320 |
+
provides a measure of predicted impact that is standardized to the same scale
|
| 321 |
+
across different variant scorers and tracks. The maximum (or minimum) value
|
| 322 |
+
never exceeds 0.999990 (or -0.999990), due to the number of variants used to
|
| 323 |
+
compute the quantiles (~300K). Because of this, we recommend using quantile
|
| 324 |
+
scores as an indicator of whether the raw score is unusually large, and using the
|
| 325 |
+
'raw scores' as a measure of magnitude of the effect for a given scorer and
|
| 326 |
+
track.
|
| 327 |
+
|
| 328 |
+
For signed variant scores (which indicate effect direction like up-regulation or
|
| 329 |
+
down-regulation), their [0,1] quantile probabilities – derived directly from the
|
| 330 |
+
rank order of the original signed raw scores – are linearly transformed to a
|
| 331 |
+
[-1,1] range. This rescaling ensures the quantile score reflects the
|
| 332 |
+
directionality of the raw score. For instance, the 0th percentile (representing
|
| 333 |
+
the most negative raw scores) maps to -1, the 50th percentile (raw scores around
|
| 334 |
+
zero) to 0, and the 100th percentile (most positive raw scores) to +1.
|
| 335 |
+
|
| 336 |
+
Note that quantile scores are only available for the suite of recommended
|
| 337 |
+
scorers.
|
| 338 |
+
|
| 339 |
+
## Other
|
| 340 |
+
|
| 341 |
+
### What terms of use apply to AlphaGenome outputs?
|
| 342 |
+
|
| 343 |
+
The AlphaGenome API is provided for non-commercial use only and is subject to
|
| 344 |
+
the AlphaGenome
|
| 345 |
+
[Terms of Service](https://deepmind.google.com/science/alphagenome/terms).
|
| 346 |
+
Outputs generated by AlphaGenome should not be used for the training of other
|
| 347 |
+
machine learning models.
|
| 348 |
+
|
| 349 |
+
### How should I cite AlphaGenome?
|
| 350 |
+
|
| 351 |
+
If you use AlphaGenome in your research, please cite using:
|
| 352 |
+
|
| 353 |
+
<!-- disableFinding(SNIPPET_INVALID_LANGUAGE) -->
|
| 354 |
+
|
| 355 |
+
```bibtex
|
| 356 |
+
@article{alphagenome,
|
| 357 |
+
title={{AlphaGenome}: advancing regulatory variant effect prediction with a unified {DNA} sequence model},
|
| 358 |
+
author={Avsec, {\v Z}iga and Latysheva, Natasha and Cheng, Jun and Novati, Guido and Taylor, Kyle R. and Ward, Tom and Bycroft, Clare and Nicolaisen, Lauren and Arvaniti, Eirini and Pan, Joshua and Thomas, Raina and Dutordoir, Vincent and Perino, Matteo and De, Soham and Karollus, Alexander and Gayoso, Adam and Sargeant, Toby and Mottram, Anne and Wong, Lai Hong and Drot{\'a}r, Pavol and Kosiorek, Adam and Senior, Andrew and Tanburn, Richard and Applebaum, Taylor and Basu, Souradeep and Hassabis, Demis and Kohli, Pushmeet},
|
| 359 |
+
year={2025},
|
| 360 |
+
doi={10.1101/2025.06.25.661532},
|
| 361 |
+
publisher={Cold Spring Harbor Laboratory},
|
| 362 |
+
journal={bioRxiv}
|
| 363 |
+
}
|
| 364 |
+
```
|
| 365 |
+
|
| 366 |
+
<!-- enableFinding(SNIPPET_INVALID_LANGUAGE) -->
|
| 367 |
+
|
| 368 |
+
### Who should I contact with issues, enquiries and feedback?
|
| 369 |
+
|
| 370 |
+
Submit bugs and any code-related issues on
|
| 371 |
+
[GitHub](https://github.com/google-deepmind/alphagenome). For general feedback,
|
| 372 |
+
questions about usage, and/or feature requests, please use the
|
| 373 |
+
[community forum](https://www.alphagenomecommunity.com) – it's actively
|
| 374 |
+
monitored by our team so you're likely to find answers and insights faster. If
|
| 375 |
+
you can't find what you're looking for, please get in touch with the AlphaGenome
|
| 376 |
+
team at <alphagenome@google.com> and we will be happy to assist you with
|
| 377 |
+
questions. We're working hard to answer all inquiries but there may be a short
|
| 378 |
+
delay in our response due to the high volume we are receiving.
|
alphagenome/source/docs/source/index.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Exploring the genome with AlphaGenome
|
| 2 |
+
|
| 3 |
+
This API provides access to AlphaGenome, Google DeepMind’s unifying model for
|
| 4 |
+
deciphering the regulatory code within DNA sequences.
|
| 5 |
+
|
| 6 |
+
AlphaGenome offers multimodal predictions, encompassing diverse functional
|
| 7 |
+
outputs such as gene expression, splicing patterns, chromatin features, and
|
| 8 |
+
contact maps (see diagram below). The model analyzes DNA sequences of up to 1
|
| 9 |
+
million base pairs in length and can deliver predictions at single base-pair
|
| 10 |
+
resolution for most outputs. AlphaGenome achieves state-of-the-art performance
|
| 11 |
+
across a range of genomic prediction benchmarks, including numerous diverse
|
| 12 |
+
variant effect prediction tasks (detailed in {cite:p}`alphagenome`).
|
| 13 |
+
|
| 14 |
+
The API is offered as a free service for
|
| 15 |
+
[non-commercial use](https://deepmind.google.com/science/alphagenome/terms).
|
| 16 |
+
Query rates vary based on demand – it is well suited for small- to medium-scale
|
| 17 |
+
analyses such as analyzing a limited number of genomic regions or variants
|
| 18 |
+
requiring 1000s of predictions, but is likely not suitable for large-scale
|
| 19 |
+
analyses requiring more than 1 million predictions.
|
| 20 |
+
|
| 21 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 22 |
+
```{figure} /_static/model_overview.png
|
| 23 |
+
:width: 600px
|
| 24 |
+
:alt: overview of AlphaGenome
|
| 25 |
+
:name: overview-figure
|
| 26 |
+
```
|
| 27 |
+
<!-- mdformat on -->
|
| 28 |
+
|
| 29 |
+
## Getting started
|
| 30 |
+
|
| 31 |
+
You can get started by
|
| 32 |
+
[getting an API key](https://deepmind.google.com/science/alphagenome), and
|
| 33 |
+
following our [Quick Start Guide](./colabs/quick_start.ipynb), or watching our
|
| 34 |
+
[AlphaGenome 101 tutorial](https://youtu.be/Xbvloe13nak). Please also check out
|
| 35 |
+
our installation guide, tutorials with comprehensive overviews of plotting,
|
| 36 |
+
variant scoring and other use cases, and our API reference documentation.
|
| 37 |
+
|
| 38 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 39 |
+
::::{grid} 1 1 2 3
|
| 40 |
+
:gutter: 2
|
| 41 |
+
|
| 42 |
+
:::{grid-item-card}
|
| 43 |
+
:link: installation
|
| 44 |
+
:link-type: doc
|
| 45 |
+
Installation
|
| 46 |
+
^^^^^^^^^^^^
|
| 47 |
+
|
| 48 |
+
Install `alphagenome` locally.
|
| 49 |
+
:::
|
| 50 |
+
|
| 51 |
+
:::{grid-item-card}
|
| 52 |
+
:link: tutorials/index
|
| 53 |
+
:link-type: doc
|
| 54 |
+
Tutorials
|
| 55 |
+
^^^^^^^^^
|
| 56 |
+
The tutorials walk through example usage of the AlphaGenome model.
|
| 57 |
+
:::
|
| 58 |
+
|
| 59 |
+
:::{grid-item-card}
|
| 60 |
+
:link: api/index
|
| 61 |
+
:link-type: doc
|
| 62 |
+
API reference
|
| 63 |
+
^^^^^^^^^^^^^
|
| 64 |
+
|
| 65 |
+
Reference documentation for the `alphagenome` package.
|
| 66 |
+
:::
|
| 67 |
+
::::
|
| 68 |
+
<!-- mdformat on -->
|
| 69 |
+
|
| 70 |
+
``` {toctree}
|
| 71 |
+
:maxdepth: 2
|
| 72 |
+
:hidden: False
|
| 73 |
+
|
| 74 |
+
../colabs/quick_start
|
| 75 |
+
installation
|
| 76 |
+
api/index
|
| 77 |
+
tutorials/index
|
| 78 |
+
user_guides/index
|
| 79 |
+
faqs
|
| 80 |
+
Community <https://www.alphagenomecommunity.com>
|
| 81 |
+
references
|
| 82 |
+
```
|
alphagenome/source/docs/source/installation.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Installation
|
| 2 |
+
|
| 3 |
+
The easiest way to install AlphaGenome is via the published
|
| 4 |
+
[PyPI package](https://pypi.org/project/alphagenome).
|
| 5 |
+
|
| 6 |
+
```bash
|
| 7 |
+
$ pip install -U alphagenome
|
| 8 |
+
```
|
| 9 |
+
|
| 10 |
+
This will install the latest version of the `alphagenome` package.
|
| 11 |
+
|
| 12 |
+
You may optionally wish to create a
|
| 13 |
+
[Python Virtual Environment](https://docs.python.org/3/tutorial/venv.html) to
|
| 14 |
+
prevent conflicts with your system's Python environment.
|
| 15 |
+
|
| 16 |
+
## Google Colab
|
| 17 |
+
|
| 18 |
+
The tutorial notebooks include a cell with the commands necessary to install
|
| 19 |
+
`alphagenome` into a colab runtime.
|
| 20 |
+
|
| 21 |
+
### Add API key to secrets
|
| 22 |
+
|
| 23 |
+
To make model requests using the tutorial notebooks, you need to add the
|
| 24 |
+
AlphaGenome API key to Colab secrets:
|
| 25 |
+
|
| 26 |
+
1. Open your Google Colab notebook and click on the 🔑 **Secrets** tab in the
|
| 27 |
+
left panel.
|
| 28 |
+
1. Create a new secret with the name `ALPHA_GENOME_API_KEY`.
|
| 29 |
+
1. Copy/paste your API key into the `Value` input box of
|
| 30 |
+
`ALPHA_GENOME_API_KEY`.
|
| 31 |
+
1. Toggle the button on the left to allow notebook access to the secret.
|
| 32 |
+
|
| 33 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 34 |
+
|
| 35 |
+
```{figure} /_static/secrets.png
|
| 36 |
+
:width: 600px
|
| 37 |
+
:alt: Image of secrets tab found on left panel.
|
| 38 |
+
:name: secrets-screenshot
|
| 39 |
+
```
|
| 40 |
+
<!-- mdformat on -->
|
| 41 |
+
|
| 42 |
+
## Running locally
|
| 43 |
+
|
| 44 |
+
To install a local copy of `alphagenome`, clone a local copy of the repository
|
| 45 |
+
and run `pip install`:
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
$ rm -rf ./alphagenome
|
| 49 |
+
$ git clone https://github.com/google-deepmind/alphagenome.git
|
| 50 |
+
$ pip install -e ./alphagenome
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
We strongly recommend using a virtual environment management system such as
|
| 54 |
+
[miniconda](https://docs.anaconda.com/miniconda/) or
|
| 55 |
+
[uv](https://docs.astral.sh/uv/pip/environments/).
|
| 56 |
+
|
| 57 |
+
In the case of miniconda, installation would be achieved with the following:
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
conda create -n alphagenome-env python=3.11
|
| 61 |
+
conda activate alphagenome-env
|
| 62 |
+
pip install -e ./alphagenome
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### Updating `alphagenome`
|
| 66 |
+
|
| 67 |
+
Assuming the relevant virtual environment is already activated:
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
cd ./alphagenome
|
| 71 |
+
git pull
|
| 72 |
+
pip install --upgrade .
|
| 73 |
+
```
|
alphagenome/source/docs/source/references.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# References
|
| 2 |
+
|
| 3 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 4 |
+
```{bibliography}
|
| 5 |
+
:cited:
|
| 6 |
+
```
|
| 7 |
+
<!-- mdformat on -->
|
alphagenome/source/docs/source/refs.bib
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@article{alphagenome,
|
| 2 |
+
title={{AlphaGenome}: advancing regulatory variant effect prediction with a unified {DNA} sequence model},
|
| 3 |
+
author={Avsec, {\v Z}iga and Latysheva, Natasha and Cheng, Jun and Novati, Guido and Taylor, Kyle R. and Ward, Tom and Bycroft, Clare and Nicolaisen, Lauren and Arvaniti, Eirini and Pan, Joshua and Thomas, Raina and Dutordoir, Vincent and Perino, Matteo and De, Soham and Karollus, Alexander and Gayoso, Adam and Sargeant, Toby and Mottram, Anne and Wong, Lai Hong and Drot{\'a}r, Pavol and Kosiorek, Adam and Senior, Andrew and Tanburn, Richard and Applebaum, Taylor and Basu, Souradeep and Hassabis, Demis and Kohli, Pushmeet},
|
| 4 |
+
year={2025},
|
| 5 |
+
doi={10.1101/2025.06.25.661532},
|
| 6 |
+
publisher={Cold Spring Harbor Laboratory},
|
| 7 |
+
journal={bioRxiv}
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
@article{gtex2020gtex,
|
| 11 |
+
title={The GTEx Consortium atlas of genetic regulatory effects across human tissues},
|
| 12 |
+
author={GTEx Consortium},
|
| 13 |
+
journal={Science},
|
| 14 |
+
volume={369},
|
| 15 |
+
number={6509},
|
| 16 |
+
pages={1318--1330},
|
| 17 |
+
year={2020},
|
| 18 |
+
publisher={American Association for the Advancement of Science}
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
@article{zhou2022sequence,
|
| 22 |
+
title={Sequence-based modeling of three-dimensional genome architecture from kilobase to chromosome scale},
|
| 23 |
+
author={Zhou, Jian},
|
| 24 |
+
journal={Nature genetics},
|
| 25 |
+
volume={54},
|
| 26 |
+
number={5},
|
| 27 |
+
pages={725--734},
|
| 28 |
+
year={2022},
|
| 29 |
+
publisher={Nature Publishing Group US New York}
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
@article{borzoi,
|
| 33 |
+
title={Predicting RNA-seq coverage from DNA sequence as a unifying model of gene regulation},
|
| 34 |
+
author={Linder, Johannes and Srivastava, Divyanshi and Yuan, Han and Agarwal, Vikram and Kelley, David R},
|
| 35 |
+
journal={Nature Genetics},
|
| 36 |
+
pages={1--13},
|
| 37 |
+
year={2025},
|
| 38 |
+
publisher={Nature Publishing Group US New York}
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
@article{spliceai,
|
| 42 |
+
title={Predicting splicing from primary sequence with deep learning},
|
| 43 |
+
author={Jaganathan, Kishore and Panagiotopoulou, Sofia Kyriazopoulou and McRae, Jeremy F and Darbandi, Siavash Fazel and Knowles, David and Li, Yang I and Kosmicki, Jack A and Arbelaez, Juan and Cui, Wenwu and Schwartz, Grace B and others},
|
| 44 |
+
journal={Cell},
|
| 45 |
+
volume={176},
|
| 46 |
+
number={3},
|
| 47 |
+
pages={535--548},
|
| 48 |
+
year={2019},
|
| 49 |
+
publisher={Elsevier}
|
| 50 |
+
}
|
alphagenome/source/docs/source/tutorials/index.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Tutorials
|
| 2 |
+
|
| 3 |
+
``` {toctree}
|
| 4 |
+
:maxdepth: 1
|
| 5 |
+
:hidden:
|
| 6 |
+
|
| 7 |
+
../colabs/visualization_modality_tour
|
| 8 |
+
../colabs/variant_scoring_ui
|
| 9 |
+
../colabs/tissue_ontology_mapping
|
| 10 |
+
../colabs/batch_variant_scoring
|
| 11 |
+
../colabs/example_analysis_workflow
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
<!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
|
| 15 |
+
|
| 16 |
+
::::{grid} 1 1 2 3
|
| 17 |
+
:gutter: 2
|
| 18 |
+
|
| 19 |
+
:::{grid-item-card} Visualizing predictions
|
| 20 |
+
:link: ../colabs/visualization_modality_tour
|
| 21 |
+
:link-type: doc
|
| 22 |
+
|
| 23 |
+
How to visualize different output modalities.
|
| 24 |
+
:::
|
| 25 |
+
|
| 26 |
+
:::{grid-item-card} Scoring and visualizing a single variant
|
| 27 |
+
:link: ../colabs/variant_scoring_ui
|
| 28 |
+
:link-type: doc
|
| 29 |
+
|
| 30 |
+
Tool for scoring and visualizing a single variant across multiple modalities.
|
| 31 |
+
:::
|
| 32 |
+
|
| 33 |
+
:::{grid-item-card} Navigating data ontologies
|
| 34 |
+
:link: ../colabs/tissue_ontology_mapping
|
| 35 |
+
:link-type: doc
|
| 36 |
+
|
| 37 |
+
Tool for fetching ontology IDs for a given tissue.
|
| 38 |
+
|
| 39 |
+
:::
|
| 40 |
+
|
| 41 |
+
:::{grid-item-card} Batch variant scoring
|
| 42 |
+
:link: ../colabs/batch_variant_scoring
|
| 43 |
+
:link-type: doc
|
| 44 |
+
|
| 45 |
+
Tool for scoring many variants at once.
|
| 46 |
+
:::
|
| 47 |
+
|
| 48 |
+
:::{grid-item-card} Example analysis workflow
|
| 49 |
+
:link: ../colabs/example_analysis_workflow
|
| 50 |
+
:link-type: doc
|
| 51 |
+
|
| 52 |
+
Example analysis of the TAL1 locus.
|
| 53 |
+
:::
::::
|
| 54 |
+
|
| 55 |
+
<!-- mdformat on -->
|
alphagenome/source/docs/source/user_guides/index.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# User guides
|
| 2 |
+
|
| 3 |
+
``` {toctree}
|
| 4 |
+
:maxdepth: 1
|
| 5 |
+
:hidden:
|
| 6 |
+
|
| 7 |
+
../colabs/essential_commands
|
| 8 |
+
../exploring_model_metadata
|
| 9 |
+
../variant_scoring
|
| 10 |
+
../visualization_library_basics
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
<!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
|
| 14 |
+
|
| 15 |
+
::::{grid} 1 1 2 3
|
| 16 |
+
:gutter: 2
|
| 17 |
+
|
| 18 |
+
:::{grid-item-card} Essential commands
|
| 19 |
+
:link: ../colabs/essential_commands
|
| 20 |
+
:link-type: doc
|
| 21 |
+
|
| 22 |
+
Essential commands for navigating AlphaGenome.
|
| 23 |
+
:::
|
| 24 |
+
|
| 25 |
+
:::{grid-item-card} Model output metadata
|
| 26 |
+
:link: ../exploring_model_metadata
|
| 27 |
+
:link-type: doc
|
| 28 |
+
|
| 29 |
+
A summary of model outputs and associated metadata.
|
| 30 |
+
:::
|
| 31 |
+
|
| 32 |
+
:::{grid-item-card} Variant scoring
|
| 33 |
+
:link: ../variant_scoring
|
| 34 |
+
:link-type: doc
|
| 35 |
+
|
| 36 |
+
Overview of how variant scores are calculated.
|
| 37 |
+
:::
|
| 38 |
+
|
| 39 |
+
:::{grid-item-card} Visualization basics
|
| 40 |
+
:link: ../visualization_library_basics
|
| 41 |
+
:link-type: doc
|
| 42 |
+
|
| 43 |
+
Guide to visualization tools.
|
| 44 |
+
:::
::::
|
| 45 |
+
|
| 46 |
+
<!-- mdformat on -->
|
alphagenome/source/docs/source/variant_scoring.md
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# How variant scoring works
|
| 2 |
+
|
| 3 |
+
A genomic variant is a difference identified in an individual's genome sequence
|
| 4 |
+
when compared to the reference genome sequence. Many genomic variants likely
|
| 5 |
+
have no appreciable impact, but it can be challenging to identify those that do
|
| 6 |
+
have a particular molecular effect. AlphaGenome predictions can be used to score
|
| 7 |
+
variants and help bridge this gap.
|
| 8 |
+
|
| 9 |
+
To do so, the variant is treated as a pair of sequences: reference (`REF`) and
|
| 10 |
+
alternate (`ALT`). The variant effect is estimated by comparing AlphaGenome
|
| 11 |
+
predictions for these two sequences across different modalities returned by the
|
| 12 |
+
model.
|
| 13 |
+
|
| 14 |
+
## Detailed steps
|
| 15 |
+
|
| 16 |
+
Variant scoring is implemented as follows:
|
| 17 |
+
|
| 18 |
+
### Make `REF` and `ALT` predictions for given modality
|
| 19 |
+
|
| 20 |
+
Variant scoring begins by generating predictions for both the reference and
|
| 21 |
+
alternative alleles of a variant, restricted to a given modality of interest
|
| 22 |
+
(e.g., `RNA-SEQ`, `ATAC`).
|
| 23 |
+
|
| 24 |
+
The model inputs at this stage are `REF` and `ALT` sequences, whose sequence
|
| 25 |
+
interval contains the variant of interest.
|
| 26 |
+
|
| 27 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 28 |
+
|
| 29 |
+
```{figure} /_static/variant_scoring_ref_alt.png
|
| 30 |
+
:width: 500px
|
| 31 |
+
:alt: Make `REF` and `ALT` predictions for given modality.
|
| 32 |
+
:name: variant-scoring-1
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
<!-- mdformat on -->
|
| 36 |
+
|
| 37 |
+
### Optional - perform indel alignment
|
| 38 |
+
|
| 39 |
+
For insertion or deletion (indel) variants, the `ALT` allele's prediction
|
| 40 |
+
profile is aligned to the `REF` allele's coordinate space. Inserted bases are
|
| 41 |
+
summarized by taking the maximum value over the inserted segment, while deleted
|
| 42 |
+
bases are treated as having zero signal in the `ALT` context, thereby enabling
|
| 43 |
+
consistent positional comparisons.
|
| 44 |
+
|
| 45 |
+
### Apply spatial mask
|
| 46 |
+
|
| 47 |
+
A spatial mask defines regions of interest within the interval containing the
|
| 48 |
+
variant. This mask can be centered on the variant or encompass a gene (gene
|
| 49 |
+
body, exons, or TSS, based on annotations from a GTF file).
|
| 50 |
+
|
| 51 |
+
At this stage, values outside of the mask are discarded.
|
| 52 |
+
|
| 53 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 54 |
+
|
| 55 |
+
```{figure} /_static/variant_scoring_spatial_mask.png
|
| 56 |
+
:width: 500px
|
| 57 |
+
:alt: Apply spatial mask.
|
| 58 |
+
:name: variant-scoring-2
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
<!-- mdformat on -->
|
| 62 |
+
|
| 63 |
+
### Aggregate spatially and compute `ALT - REF`
|
| 64 |
+
|
| 65 |
+
Aggregation occurs at this stage, which includes the following:
|
| 66 |
+
|
| 67 |
+
* reduction along the spatial axis, using `mean` or `sum`, etc.
|
| 68 |
+
* (optional) scaling, such as a $\log$ or $l^2$ transform.
|
| 69 |
+
* difference between `ALT - REF`.
|
| 70 |
+
|
| 71 |
+
The final outcome is a single scalar value per track.
|
| 72 |
+
|
| 73 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 74 |
+
|
| 75 |
+
```{figure} /_static/variant_scoring_spatial_compute.png
|
| 76 |
+
:width: 500px
|
| 77 |
+
:alt: Aggregate spatially and compute `ALT - REF`.
|
| 78 |
+
:name: variant-scoring-3
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
<!-- mdformat on -->
|
| 82 |
+
|
| 83 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 84 |
+
|
| 85 |
+
```{note}
|
| 86 |
+
Aggregation logic is encapsulated in the options listed in
|
| 87 |
+
{class}`~alphagenome.models.variant_scorers.AggregationType`.
|
| 88 |
+
|
| 89 |
+
The naming of the options reflects the order of operations of each of the above
|
| 90 |
+
steps, with the right-most operation applied first to the model predictions.
|
| 91 |
+
|
| 92 |
+
For example,
|
| 93 |
+
{class}`~alphagenome.models.variant_scorers.AggregationType.DIFF_SUM_LOG2`,
|
| 94 |
+
applies a log2 transform, then a sum, to track data. It then returns the
|
| 95 |
+
difference between `ALT - REF`.
|
| 96 |
+
|
| 97 |
+
Some aggregation options may apply the exact same steps, but in a different order.
|
| 98 |
+
|
| 99 |
+
Regardless of the order of operations, each aggregation type returns one single
|
| 100 |
+
scalar value per track.
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
<!-- mdformat on -->
|
| 104 |
+
|
| 105 |
+
### Optional - aggregate tracks
|
| 106 |
+
|
| 107 |
+
After variant scoring is completed, optional track selection and additional
|
| 108 |
+
aggregation can be applied.
|
| 109 |
+
|
| 110 |
+
Suggestions include additional aggregation (mean, max, sum, etc.) over:
|
| 111 |
+
|
| 112 |
+
* All tracks
|
| 113 |
+
* Subsets of tracks
|
| 114 |
+
|
| 115 |
+
Or, a single track of interest can be chosen, i.e., from a particular sample.
|
| 116 |
+
|
| 117 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 118 |
+
|
| 119 |
+
```{figure} /_static/variant_scoring_aggregate.png
|
| 120 |
+
:width: 500px
|
| 121 |
+
:alt: Optional - aggregate tracks.
|
| 122 |
+
:name: variant-scoring-4
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
<!-- mdformat on -->
|
| 126 |
+
|
| 127 |
+
## Modality-specific recommended variant scorers
|
| 128 |
+
|
| 129 |
+
We have established a set of recommended variant scorers, available via
|
| 130 |
+
{func}`~alphagenome.models.variant_scorers.get_recommended_scorers`, covering
|
| 131 |
+
diverse genomic modalities as outlined below:
|
| 132 |
+
|
| 133 |
+
### Gene Expression (RNA-seq)
|
| 134 |
+
|
| 135 |
+
Variant scores quantify the impact on overall gene transcript abundance.
|
| 136 |
+
|
| 137 |
+
* comparison: predicted RNA coverage between `REF` and `ALT` alleles
|
| 138 |
+
* mask: exons for a gene of interest
|
| 139 |
+
* aggregation: Log-fold change of gene expression level between the `ALT` and
|
| 140 |
+
`REF` alleles: {math}`\log(mean(ALT) + 0.001) - \log(mean(REF) + 0.001)`
|
| 141 |
+
|
| 142 |
+
### Polyadenylation Site (PAS) Usage
|
| 143 |
+
|
| 144 |
+
This follows Borzoi's {cite:p}`borzoi` methodology for scoring polyadenylation
|
| 145 |
+
quantitative trait loci (paQTLs), which captures the variant's impact on RNA
|
| 146 |
+
isoform production.
|
| 147 |
+
|
| 148 |
+
* comparison: predicted RNA coverage between `REF` and `ALT` alleles
|
| 149 |
+
* mask: local 400-bp windows around 3' cleavage junctions
|
| 150 |
+
* aggregation: Maximum absolute log-fold change of isoform ratios
|
| 151 |
+
(distal/proximal PAS usage) between `REF` and `ALT`, considering all
|
| 152 |
+
proximal/distal splits.
|
| 153 |
+
|
| 154 |
+
### TSS Activity (CAGE, PRO-cap)
|
| 155 |
+
|
| 156 |
+
Variant scores quantify local changes at TSSs.
|
| 157 |
+
|
| 158 |
+
* comparison: predicted CAGE or PRO-cap coverage between `REF` and `ALT`
|
| 159 |
+
alleles
|
| 160 |
+
* mask: local 501-bp window centered at the variant
|
| 161 |
+
* aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
|
| 162 |
+
(sum(REF) + 1)]`
|
| 163 |
+
|
| 164 |
+
### Chromatin Accessibility (ATAC-seq, DNase-seq)
|
| 165 |
+
|
| 166 |
+
Variant scores quantify local accessibility changes.
|
| 167 |
+
|
| 168 |
+
* comparison: predicted ATAC-seq or DNase-seq coverage between `REF` and `ALT`
|
| 169 |
+
alleles
|
| 170 |
+
* mask: local 501-bp window centered at the variant
|
| 171 |
+
* aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
|
| 172 |
+
(sum(REF) + 1)]`
|
| 173 |
+
|
| 174 |
+
### Transcription Factor Binding (ChIP-TF)
|
| 175 |
+
|
| 176 |
+
Variant scores quantify changes in TF binding intensity.
|
| 177 |
+
|
| 178 |
+
* comparison: predicted ChIP-TF coverage between `REF` and `ALT` alleles
|
| 179 |
+
* mask: local 501-bp window centered at the variant
|
| 180 |
+
* aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
|
| 181 |
+
(sum(REF) + 1)]`
|
| 182 |
+
|
| 183 |
+
### Histone Modifications (ChIP-Histone)
|
| 184 |
+
|
| 185 |
+
Variant scores quantify changes in histone modifications.
|
| 186 |
+
|
| 187 |
+
* comparison: predicted ChIP-Histone coverage between `REF` and `ALT` alleles
|
| 188 |
+
* mask: local 2001-bp window centered at the variant
|
| 189 |
+
* aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
|
| 190 |
+
(sum(REF) + 1)]`
|
| 191 |
+
|
| 192 |
+
### Splicing (Splice Sites)
|
| 193 |
+
|
| 194 |
+
Variant scores quantify changes in the class assignment probabilities (acceptor,
|
| 195 |
+
donor) at all potential splice sites within a gene body.
|
| 196 |
+
|
| 197 |
+
* comparison: class assignment probabilities for `REF` and `ALT` alleles
|
| 198 |
+
* mask: gene body for a gene of interest
|
| 199 |
+
* aggregation: Maximum absolute difference of predicted splice site
|
| 200 |
+
probabilities across the gene body: {math}`max(|ALT - REF|)`
|
| 201 |
+
|
| 202 |
+
### Splicing (Splice Site Usage)
|
| 203 |
+
|
| 204 |
+
Variant scores quantify changes in the usage of splice sites (i.e., increased or
|
| 205 |
+
decreased fractions).
|
| 206 |
+
|
| 207 |
+
* comparison: predicted splice site usage between `REF` and `ALT` alleles
|
| 208 |
+
* mask: gene body for a gene of interest
|
| 209 |
+
* aggregation: Maximum absolute difference of predicted splice site usage
|
| 210 |
+
across the gene body: {math}`max(|ALT - REF|)`
|
| 211 |
+
|
| 212 |
+
### Splicing (Splice Junctions)
|
| 213 |
+
|
| 214 |
+
Variant scores quantify changes in the predicted RNA-seq reads spanning a
|
| 215 |
+
junction, which is a function of expression level, splice site usage and
|
| 216 |
+
splicing efficiency.
|
| 217 |
+
|
| 218 |
+
* comparison: predicted paired junction counts between `REF` and `ALT` alleles
|
| 219 |
+
* mask: top-k splice sites for a gene of interest (including annotated and
|
| 220 |
+
predicted splice sites)
|
| 221 |
+
* aggregation: Maximum absolute log-fold change of predicted junction counts
|
| 222 |
+
across splice site pairs of interest: {math}`max(|log(ALT) - log(REF)|)`
|
| 223 |
+
|
| 224 |
+
### 3D Genome Contact (Contact Maps)
|
| 225 |
+
|
| 226 |
+
Variant scores quantify local contact disruption.
|
| 227 |
+
|
| 228 |
+
* comparison: predicted contact frequencies between `REF` and `ALT` alleles
|
| 229 |
+
* mask: local 1-Mb window centered at the variant
|
| 230 |
+
* aggregation: Mean absolute difference of contact frequencies, for all
|
| 231 |
+
interactions involving the variant-containing bin.
|
| 232 |
+
|
| 233 |
+
### Active Allele Scorers
|
| 234 |
+
|
| 235 |
+
In addition to the differential scores described above, we also provide scoring
|
| 236 |
+
configurations that capture the absolute activity level associated with one of
|
| 237 |
+
the alleles, rather than quantifying the change between `REF` and `ALT`. This is
|
| 238 |
+
calculated by taking the maximum of the aggregated signals from the `REF` and
|
| 239 |
+
`ALT` alleles over the masked central window or gene region.
|
| 240 |
+
|
| 241 |
+
We provide recommended active allele scorers for the following modalities:
|
| 242 |
+
|
| 243 |
+
* Gene expression (RNA-seq): {math}`max(mean(ALT), mean(REF))` across exons
|
| 244 |
+
for a gene of interest
|
| 245 |
+
* TSS activity (CAGE, PRO-cap): {math}`max(sum(ALT), sum(REF))` within a local
|
| 246 |
+
501-bp window centered at the variant
|
| 247 |
+
* Chromatin Accessibility (ATAC-seq, DNase-seq): {math}`max(sum(ALT),
|
| 248 |
+
sum(REF))` within a local 501-bp window centered at the variant
|
| 249 |
+
* Transcription Factor binding (ChIP-TF): {math}`max(sum(ALT), sum(REF))`
|
| 250 |
+
within a local 501-bp window centered at the variant
|
| 251 |
+
* Histone modifications (ChIP-Histone): {math}`max(sum(ALT), sum(REF))` within
|
| 252 |
+
a local 2001-bp window centered at the variant
|
| 253 |
+
|
| 254 |
+
## Available variant scorers
|
| 255 |
+
|
| 256 |
+
For more on the types of variant scorers and how they work, visit the
|
| 257 |
+
[API documentation](api/models.md#variant-scorers).
|
alphagenome/source/docs/source/visualization_library_basics.md
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Visualization basics
|
| 2 |
+
|
| 3 |
+
<!-- disableFinding(LINK_ID) -->
|
| 4 |
+
|
| 5 |
+
AlphaGenome predicts a variety of output types with different data shapes and
|
| 6 |
+
biological interpretations ([table](#viz-table)). We provide
|
| 7 |
+
[`alphagenome.visualization`](project:api/visualization.md) to generate
|
| 8 |
+
matplotlib figures from model API outputs, which we outline here.
|
| 9 |
+
|
| 10 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 11 |
+
|
| 12 |
+
```{tip}
|
| 13 |
+
See the {doc}`visualizing predictions tutorial </colabs/visualization_modality_tour>`
|
| 14 |
+
for worked examples of plotting different modalities.
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
<!-- mdformat on -->
|
| 18 |
+
|
| 19 |
+
## Plot
|
| 20 |
+
|
| 21 |
+
The key function, {func}`~alphagenome.visualization.plot_components.plot`, takes
|
| 22 |
+
as input a list of components and returns a {class}`matplotlib.figure.Figure`.
|
| 23 |
+
|
| 24 |
+
## Components
|
| 25 |
+
|
| 26 |
+
A component is a light wrapper around a model output (such as predicted genomic
|
| 27 |
+
tracks, splice junctions, etc.) and specifies plot aesthetics. Each component
|
| 28 |
+
maps to one vertically stacked subplot in the final figure (see blue text in the
|
| 29 |
+
[figure](#viz-figure)). Each component has an independent y-axis but shares a
|
| 30 |
+
common x-axis, corresponding to the length of the DNA interval, in base pairs
|
| 31 |
+
(bp).
|
| 32 |
+
|
| 33 |
+
Several default components are available, each designed to best visually
|
| 34 |
+
represent different modalities and data shapes returned by the model API (see
|
| 35 |
+
[table](#viz-table)).
|
| 36 |
+
|
| 37 |
+
## Annotations
|
| 38 |
+
|
| 39 |
+
Additional figure elements specific to the DNA interval, but outside of
|
| 40 |
+
components -- such as locations of promoters or variants -- can be overlaid via
|
| 41 |
+
a list of annotations that are passed to
|
| 42 |
+
{func}`~alphagenome.visualization.plot_components.plot`.
|
| 43 |
+
|
| 44 |
+
## Custom plotting
|
| 45 |
+
|
| 46 |
+
For users interested in configuring novel components, extend the
|
| 47 |
+
{class}`~alphagenome.visualization.plot_components.AbstractComponent` and
|
| 48 |
+
{class}`~alphagenome.visualization.plot_components.AbstractAnnotation` base
|
| 49 |
+
classes.
|
| 50 |
+
|
| 51 |
+
Any other data supplied by the user can be visualized using this library as is,
|
| 52 |
+
as long as it is provided to
|
| 53 |
+
[`plot_components`](project:api/visualization.md#plot-components) in the format
|
| 54 |
+
required e.g. {class}`~alphagenome.data.track_data.TrackData` for
|
| 55 |
+
{class}`~alphagenome.visualization.plot_components.Tracks`.
|
| 56 |
+
|
| 57 |
+
<!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
|
| 58 |
+
|
| 59 |
+
```{figure} /_static/visualization_overview.png
|
| 60 |
+
:height: 600px
|
| 61 |
+
:alt: visualization library description/overview
|
| 62 |
+
:name: viz-figure
|
| 63 |
+
|
| 64 |
+
Illustrative diagram of visualization library. Blue text indicates
|
| 65 |
+
[`plot_components`](<project:api/visualization.md#plot-components>) classes, and purple text indicates arguments to
|
| 66 |
+
[`plot_components`](<project:api/visualization.md#plot-components>) that adjust figure-wide aesthetics.
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
```{list-table} Plotting components and annotation classes.
|
| 70 |
+
:widths: 10 30 10 10 30 10
|
| 71 |
+
:header-rows: 1
|
| 72 |
+
:name: viz-table
|
| 73 |
+
|
| 74 |
+
* - Component name plot\_components.\*
|
| 75 |
+
- Description
|
| 76 |
+
- Example figure
|
| 77 |
+
- Data shape supported
|
| 78 |
+
- Recommended model outputs
|
| 79 |
+
- Good for visualizing variants?
|
| 80 |
+
* - {class}`~alphagenome.visualization.plot_components.Tracks`
|
| 81 |
+
- A line-plot visualizing a scalar value at each genomic position (or
|
| 82 |
+
coarser resolution) e.g. predictions of RNA\_SEQ for a specific track.
|
| 83 |
+
- Colab cell
|
| 84 |
+
- 1D
|
| 85 |
+
- All except SPLICE\_JUNCTIONS; CONTACT\_MAPS
|
| 86 |
+
- No
|
| 87 |
+
* - {class}`~alphagenome.visualization.plot_components.OverlaidTracks`
|
| 88 |
+
- A line-plot as for Tracks, but with two separate lines on the same axis
|
| 89 |
+
with different colors e.g. predictions of RNA\_SEQ for the Reference and
|
| 90 |
+
Alternative sequence defined by a variant.
|
| 91 |
+
- Colab cell
|
| 92 |
+
- 1D x 2
|
| 93 |
+
- All except SPLICE\_JUNCTIONS; CONTACT\_MAPS
|
| 94 |
+
- Yes
|
| 95 |
+
* - {class}`~alphagenome.visualization.plot_components.Sashimi`
|
| 96 |
+
- A series of arcs, each representing a scalar value for a pair of genomic
|
| 97 |
+
positions (e.g. splice junctions). The thickness of the arcs is
|
| 98 |
+
determined by the relative sizes of the scalars.
|
| 99 |
+
- Colab cell
|
| 100 |
+
- 2D (sparse)
|
| 101 |
+
- SPLICE\_JUNCTIONS
|
| 102 |
+
- Yes
|
| 103 |
+
* - {class}`~alphagenome.visualization.plot_components.SeqLogo`
|
| 104 |
+
- A sequence of letters (bases) with heights corresponding to a single
|
| 105 |
+
scalar value per genomic position (e.g. from contribution scores).
|
| 106 |
+
- Colab cell
|
| 107 |
+
- 1D \+ sequence
|
| 108 |
+
- ISM contribution scores
|
| 109 |
+
- Yes
|
| 110 |
+
* - {class}`~alphagenome.visualization.plot_components.ContactMaps`
|
| 111 |
+
- A heatmap visualizing a matrix of scalars (e.g. predicted DNA-DNA
|
| 112 |
+
contacts), one for each pair of genomic positions in an interval.
|
| 113 |
+
- Colab cell
|
| 114 |
+
- 2D
|
| 115 |
+
- CONTACT\_MAPS
|
| 116 |
+
- No
|
| 117 |
+
* - {class}`~alphagenome.visualization.plot_components.ContactMapsDiff`
|
| 118 |
+
- A heatmap as for ContactMaps, but with a diverging color map centered on
|
| 119 |
+
zero (white) to represent values derived from differences (e.g. ALT \-
|
| 120 |
+
REF)
|
| 121 |
+
- Colab cell
|
| 122 |
+
- 2D
|
| 123 |
+
- CONTACT\_MAPS
|
| 124 |
+
- Yes
|
| 125 |
+
* - {class}`~alphagenome.visualization.plot_components.TranscriptAnnotation`
|
| 126 |
+
- Horizontal lines representing locations of transcripts. Exons, introns,
|
| 127 |
+
untranslated regions, and direction of transcription are indicated by
|
| 128 |
+
differences in line thickness.
|
| 129 |
+
- Colab cell
|
| 130 |
+
- Interval(s)
|
| 131 |
+
- N/A
|
| 132 |
+
- No
|
| 133 |
+
* - {class}`~alphagenome.visualization.plot_components.VariantAnnotation`
|
| 134 |
+
- A semi-transparent rectangle (or vertical line if a variant) spanning
|
| 135 |
+
all plot components, indicating the location of an interval (or
|
| 136 |
+
variant). The interval (variant) is optionally labeled.
|
| 137 |
+
- Colab cell
|
| 138 |
+
- Interval(s) or Variant(s)
|
| 139 |
+
- N/A
|
| 140 |
+
- Yes
|
| 141 |
+
* - {class}`~alphagenome.visualization.plot_components.AbstractComponent`
|
| 142 |
+
- This is an abstract class, which is the parent class of most
|
| 143 |
+
plot\_components.\*. A user can define their own component class,
|
| 144 |
+
provided it adheres to the structure specified by AbstractComponent. The
|
| 145 |
+
workhorse method is plot\_ax(), which populates a matplotlib.axes.Axes
|
| 146 |
+
object with visuals defined by the input data.
|
| 147 |
+
- N/A
|
| 148 |
+
- N/A
|
| 149 |
+
- N/A
|
| 150 |
+
- N/A
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
<!-- mdformat on -->
|
alphagenome/source/hatch_build.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Hatch build hook to generate Python bindings for protos."""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
from typing import Any
|
| 19 |
+
from grpc_tools import protoc
|
| 20 |
+
from hatchling.builders.hooks.plugin.interface import BuildHookInterface # pylint: disable=g-importing-member
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
_ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'src')
|
| 24 |
+
|
| 25 |
+
# Tuple of proto message definitions to build Python bindings for. Paths must
|
| 26 |
+
# be relative to root directory.
|
| 27 |
+
_ALPHAGENOME_PROTOS = (
|
| 28 |
+
'alphagenome/protos/dna_model.proto',
|
| 29 |
+
'alphagenome/protos/dna_model_service.proto',
|
| 30 |
+
'alphagenome/protos/tensor.proto',
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class GenerateProtos(BuildHookInterface):
  """Hatch build hook that compiles the alphagenome protos to Python."""

  def initialize(self, version: str, build_data: dict[str, Any]) -> None:
    """Runs `grpc_tools.protoc` once for each path in `_ALPHAGENOME_PROTOS`.

    Both the message bindings (`--python_out`) and the gRPC bindings
    (`--grpc_python_out`) are written under `_ROOT_DIR`, which also serves as
    the proto import root.

    Args:
      version: Unused.
      build_data: Unused.

    Raises:
      RuntimeError: If protoc returns a non-zero exit code for any proto file.
    """
    del version, build_data  # Unused.

    # Arguments shared by every invocation; only the input file changes.
    shared_args = [
        'grpc_tools.protoc',
        f'--proto_path={_ROOT_DIR}',
        f'--python_out={_ROOT_DIR}',
        f'--grpc_python_out={_ROOT_DIR}',
    ]

    for relative_path in _ALPHAGENOME_PROTOS:
      command = [*shared_args, os.path.join(_ROOT_DIR, relative_path)]
      exit_code = protoc.main(command)
      if exit_code != 0:
        raise RuntimeError(f'ERROR: {command}')
|
alphagenome/source/pyproject.toml
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
build-backend = 'hatchling.build'
|
| 3 |
+
requires = ['hatchling', 'grpcio-tools<=1.67.1', 'importlib-resources']
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
[project]
|
| 7 |
+
name = 'alphagenome'
|
| 8 |
+
description = 'A Python SDK for interacting and visualizing genomic models.'
|
| 9 |
+
readme = 'README.md'
|
| 10 |
+
dynamic = ['version']
|
| 11 |
+
license = { file = 'LICENSE' }
|
| 12 |
+
requires-python = '>=3.10'
|
| 13 |
+
authors = [
|
| 14 |
+
{name = 'Google LLC'},
|
| 15 |
+
{email = 'alphagenome@google.com'},
|
| 16 |
+
]
|
| 17 |
+
keywords = [
|
| 18 |
+
'python',
|
| 19 |
+
'machine learning',
|
| 20 |
+
'genomics'
|
| 21 |
+
]
|
| 22 |
+
classifiers=[
|
| 23 |
+
'Development Status :: 4 - Beta',
|
| 24 |
+
'Environment :: Console',
|
| 25 |
+
'Intended Audience :: Science/Research',
|
| 26 |
+
'License :: OSI Approved :: Apache Software License',
|
| 27 |
+
'Operating System :: OS Independent',
|
| 28 |
+
'Programming Language :: Python :: 3.10',
|
| 29 |
+
'Programming Language :: Python :: 3.11',
|
| 30 |
+
'Programming Language :: Python :: 3.12',
|
| 31 |
+
'Programming Language :: Python :: 3.13',
|
| 32 |
+
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
| 33 |
+
]
|
| 34 |
+
dependencies=[
|
| 35 |
+
# keep-sorted start
|
| 36 |
+
'absl-py',
|
| 37 |
+
'anndata',
|
| 38 |
+
'grpcio>=1.67.1',
|
| 39 |
+
'immutabledict',
|
| 40 |
+
'intervaltree',
|
| 41 |
+
'jaxtyping',
|
| 42 |
+
'matplotlib',
|
| 43 |
+
'ml_dtypes',
|
| 44 |
+
'numpy',
|
| 45 |
+
'pandas',
|
| 46 |
+
'protobuf>=5.28.3',
|
| 47 |
+
'pyarrow',
|
| 48 |
+
'scipy',
|
| 49 |
+
'seaborn',
|
| 50 |
+
'tqdm',
|
| 51 |
+
'typeguard',
|
| 52 |
+
'typing_extensions',
|
| 53 |
+
'zstandard',
|
| 54 |
+
# keep-sorted end
|
| 55 |
+
]
|
| 56 |
+
|
| 57 |
+
[project.urls]
|
| 58 |
+
Repository = 'https://github.com/google-deepmind/alphagenome'
|
| 59 |
+
Documentation = 'https://www.alphagenomedocs.com/'
|
| 60 |
+
|
| 61 |
+
[project.optional-dependencies]
|
| 62 |
+
dev = [
|
| 63 |
+
'hatch',
|
| 64 |
+
]
|
| 65 |
+
docs = [
|
| 66 |
+
'ipykernel',
|
| 67 |
+
'ipython',
|
| 68 |
+
'myst-nb',
|
| 69 |
+
'sphinx>=5.0',
|
| 70 |
+
'sphinx-autodoc-typehints',
|
| 71 |
+
'sphinx-book-theme',
|
| 72 |
+
'sphinx-copybutton',
|
| 73 |
+
'sphinx-remove-toctrees',
|
| 74 |
+
'sphinx-design',
|
| 75 |
+
'sphinxcontrib-bibtex>=1.0.0',
|
| 76 |
+
]
|
| 77 |
+
scripts = [
|
| 78 |
+
'absl-py',
|
| 79 |
+
'pyarrow',
|
| 80 |
+
'pyranges',
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
# Calls hatch_build.py to generate Python bindings for protos.
|
| 84 |
+
[tool.hatch.build.hooks.custom]
|
| 85 |
+
|
| 86 |
+
[tool.hatch.version]
|
| 87 |
+
path = 'src/alphagenome/__init__.py'
|
| 88 |
+
|
| 89 |
+
[tool.hatch.envs.default]
|
| 90 |
+
installer = 'uv'
|
| 91 |
+
|
| 92 |
+
[tool.setuptools.packages.find]
|
| 93 |
+
include = ['README.md', 'LICENSE']
|
| 94 |
+
exclude = ['*_test.py', 'examples']
|
| 95 |
+
|
| 96 |
+
[tool.hatch.envs.hatch-test]
|
| 97 |
+
default-args = []
|
| 98 |
+
extra-dependencies=['google-benchmark', 'typeguard==2.13.3']
|
| 99 |
+
parallel = true
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
[tool.hatch.envs.hatch-test.env-vars]
|
| 103 |
+
MPLBACKEND = 'agg'
|
| 104 |
+
|
| 105 |
+
[[tool.hatch.envs.hatch-test.matrix]]
|
| 106 |
+
# Use hatch test --all to run tests on all supported Python versions.
|
| 107 |
+
python = ['3.13', '3.12', '3.11', '3.10']
|
| 108 |
+
|
| 109 |
+
[tool.hatch.envs.check]
|
| 110 |
+
dependencies = [
|
| 111 |
+
'pyink>=24.3.0',
|
| 112 |
+
'pylint>=2.6.0',
|
| 113 |
+
]
|
| 114 |
+
# Do not install dependencies for the check environment.
|
| 115 |
+
detached = true
|
| 116 |
+
|
| 117 |
+
[tool.hatch.envs.check.scripts]
|
| 118 |
+
format = 'pyink . --check'
|
| 119 |
+
lint = 'pylint .'
|
| 120 |
+
all = [
|
| 121 |
+
'format',
|
| 122 |
+
'lint',
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
[tool.pyink]
|
| 126 |
+
# Formatting configuration to follow Google style-guide
|
| 127 |
+
line-length = 80
|
| 128 |
+
unstable = true
|
| 129 |
+
pyink-indentation = 2
|
| 130 |
+
pyink-use-majority-quotes = true
|
| 131 |
+
exclude = 'src/alphagenome/protos'
|
alphagenome/source/scripts/process_gtf.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Script to process GTF into feather file."""
|
| 16 |
+
|
| 17 |
+
from absl import app
|
| 18 |
+
from absl import flags
|
| 19 |
+
from absl import logging
|
| 20 |
+
import pyranges
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
_GTF_PATH = flags.DEFINE_string(
|
| 24 |
+
'gtf_path', None, 'Path to GTF file.', required=True
|
| 25 |
+
)
|
| 26 |
+
_OUTPUT_PATH = flags.DEFINE_string(
|
| 27 |
+
'output_path', None, 'Path to output feather file.', required=True
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main(_) -> None:
  """Converts the GTF at --gtf_path into a feather file at --output_path.

  The GTF is loaded as a DataFrame and gains a `gene_id_nopatch` column holding
  the portion of `gene_id` that precedes the first '.'.
  """
  gtf_path = _GTF_PATH.value
  logging.info('Reading GTF from %s', gtf_path)
  gtf = pyranges.read_gtf(gtf_path, as_df=True)

  # Column 0 of the expanded split is the text before the first '.'.
  gene_id_parts = gtf['gene_id'].str.split('.', expand=True)
  gtf['gene_id_nopatch'] = gene_id_parts[0]

  output_path = _OUTPUT_PATH.value
  logging.info('Writing GTF to %s', output_path)
  gtf.to_feather(output_path)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
if __name__ == '__main__':
|
| 42 |
+
app.run(main)
|
alphagenome/source/src/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
src Package Initialization File
|
| 4 |
+
"""
|
alphagenome/source/src/alphagenome/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""A Python SDK for interacting and visualizing genomic models."""
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Version string of the `alphagenome` package.
__version__ = '0.2.0'
|
alphagenome/source/src/alphagenome/colab_utils.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# https://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Utility functions for Google Colab."""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_api_key(secret: str = 'ALPHA_GENOME_API_KEY') -> str:
  """Returns API key from environment variable or Colab secrets.

  Tries to retrieve the API key from the environment first. If not found (or
  empty), attempts to retrieve it from Colab secrets (if running in Colab).

  Args:
    secret: The name of the environment variable or Colab secret key to
      retrieve.

  Returns:
    The API key found under `secret`.

  Raises:
    ValueError: If the API key cannot be found in the environment or Colab
      secrets.
  """

  if api_key := os.environ.get(secret):
    return api_key

  # Only the import itself is guarded: an ImportError raised later, while the
  # secret is being read, must not be mistaken for "not running in Colab".
  try:
    # pylint: disable=g-import-not-at-top, import-outside-toplevel
    from google.colab import userdata  # pytype: disable=import-error
    # pylint: enable=g-import-not-at-top, import-outside-toplevel
  except ImportError:
    # Not running in Colab.
    userdata = None

  # Raised outside the `except` block so the ImportError is not attached as
  # implicit exception context.
  if userdata is None:
    raise ValueError(f'Cannot find API key with {secret=}.')

  try:
    return userdata.get(secret)
  except (
      userdata.NotebookAccessError,
      userdata.SecretNotFoundError,
      userdata.TimeoutException,
  ) as e:
    raise ValueError(
        f'Cannot find or access API key in Colab secrets with {secret=}. Make'
        ' sure you have added the API key to Colab secrets and enabled'
        ' access. See'
        ' https://www.alphagenomedocs.com/installation.html#add-api-key-to-secrets'
        ' for more details.'
    ) from e
|
alphagenome/source/src/alphagenome/colab_utils_test.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
from unittest import mock
|
| 18 |
+
|
| 19 |
+
from absl.testing import absltest
|
| 20 |
+
from alphagenome import colab_utils
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
_TEST_SECRET_KEY = '_TEST_ALPHAGENOME_API_KEY'
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ColabUtilsTest(absltest.TestCase):
  """Tests for `colab_utils.get_api_key`.

  Every test isolates `os.environ` (and `sys.modules` where relevant) so the
  outcome does not depend on whether the developer has an API key exported or
  whether `google.colab` happens to be importable on the host.
  """

  def test_get_api_key_from_environment(self):
    """The environment variable is returned when it is set."""
    with mock.patch.dict(os.environ, {_TEST_SECRET_KEY: 'foo'}):
      self.assertEqual(colab_utils.get_api_key(_TEST_SECRET_KEY), 'foo')

  def test_get_api_key_from_environment_not_found_raises_error(self):
    """Outside Colab, a missing environment variable raises ValueError."""
    # A `None` entry in `sys.modules` makes `from google.colab import ...`
    # raise ImportError, which simulates running outside of Colab.
    with mock.patch.dict(os.environ), mock.patch.dict(
        sys.modules, {'google.colab': None}
    ):
      os.environ.pop(_TEST_SECRET_KEY, None)
      with self.assertRaisesRegex(
          ValueError,
          f"Cannot find API key with secret='{_TEST_SECRET_KEY}'.",
      ):
        _ = colab_utils.get_api_key(_TEST_SECRET_KEY)

  def test_get_api_key_from_colab_secrets(self):
    """With no environment variable, the key is read from Colab secrets."""
    default_secret = 'ALPHA_GENOME_API_KEY'
    mock_colab = mock.MagicMock()
    mock_colab.userdata.get.return_value = 'bar'

    with mock.patch.dict(os.environ), mock.patch.dict(
        sys.modules, {'google': mock.MagicMock(), 'google.colab': mock_colab}
    ):
      # An exported real key would otherwise short-circuit the Colab lookup.
      os.environ.pop(default_secret, None)
      self.assertEqual(colab_utils.get_api_key(), 'bar')
      mock_colab.userdata.get.assert_called_once_with(default_secret)

  def test_get_api_key_from_colab_secrets_not_found_raises_error(self):
    """Colab lookup failures are re-raised as ValueError."""
    mock_colab = mock.MagicMock()
    mock_colab.userdata.NotebookAccessError = Exception
    mock_colab.userdata.SecretNotFoundError = Exception
    mock_colab.userdata.TimeoutException = Exception

    mock_colab.userdata.get.side_effect = Exception()

    secret = 'my_secret'
    with mock.patch.dict(os.environ), mock.patch.dict(
        sys.modules, {'google': mock.MagicMock(), 'google.colab': mock_colab}
    ):
      os.environ.pop(secret, None)
      with self.assertRaisesRegex(
          ValueError,
          f'Cannot find or access API key in Colab secrets with {secret=}.',
      ):
        _ = colab_utils.get_api_key(secret)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
if __name__ == '__main__':
|
| 68 |
+
absltest.main()
|
alphagenome/source/src/alphagenome/data/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Data classes for interacting and visualizing genomic models."""
|
alphagenome/source/src/alphagenome/data/fold_intervals.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Genomics intervals used for training model folds."""
|
| 16 |
+
|
| 17 |
+
import enum
|
| 18 |
+
|
| 19 |
+
from alphagenome.models import dna_client
|
| 20 |
+
import immutabledict
|
| 21 |
+
import pandas as pd
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Default per-organism BED files listing the example regions and the data fold
# each region belongs to. The URLs point at a fixed commit of the
# calico/borzoi repository.
_DEFAULT_EXAMPLE_REGIONS = immutabledict.immutabledict({
    dna_client.Organism.HOMO_SAPIENS: (
        'https://github.com/calico/borzoi/raw/'
        '5c9358222b5026abb733ed5fb84f3f6c77239b37/data/sequences_human.bed.gz'
    ),
    dna_client.Organism.MUS_MUSCULUS: (
        'https://github.com/calico/borzoi/raw/'
        '5c9358222b5026abb733ed5fb84f3f6c77239b37/data/sequences_mouse.bed.gz'
    ),
})
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Subset(enum.Enum):
  """Data split that a set of fold intervals belongs to."""

  # Folds that are neither the validation nor the test fold.
  TRAIN = 0
  # The single validation fold of a model version.
  VALID = 1
  # The single test fold of a model version.
  TEST = 2
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Model fold 1 is aligned with all trained Borzoi checkpoints: data folds 3
# (validation) and 4 (test) are held out.
|
| 45 |
+
# Validation data fold for each model fold index. Index -1 stands for the
# all-folds model (see `_MODEL_VERSION_TO_FOLD`).
_VALID_FOLD = immutabledict.immutabledict({
    0: 'fold0',
    1: 'fold3',
    2: 'fold2',
    3: 'fold6',
    -1: 'fold0',
})

# Test data fold for each model fold index; keyed like `_VALID_FOLD`.
_TEST_FOLD = immutabledict.immutabledict({
    0: 'fold1',
    1: 'fold4',
    2: 'fold5',
    3: 'fold7',
    -1: 'fold1',
})

# Model fold index used as the key into `_VALID_FOLD` and `_TEST_FOLD`.
_MODEL_VERSION_TO_FOLD = immutabledict.immutabledict({
    dna_client.ModelVersion.FOLD_0: 0,
    dna_client.ModelVersion.FOLD_1: 1,
    dna_client.ModelVersion.FOLD_2: 2,
    dna_client.ModelVersion.FOLD_3: 3,
    dna_client.ModelVersion.ALL_FOLDS: -1,
})
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def get_all_folds() -> list[str]:
  """Returns the eight data fold names, 'fold0' through 'fold7', in order."""
  fold_indices = range(8)
  return [f'fold{index}' for index in fold_indices]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def get_fold_names(
    model_version: dna_client.ModelVersion, subset: Subset
) -> list[str]:
  """Returns the names of the data folds in `subset` for `model_version`.

  Args:
    model_version: Model version whose fold assignment is looked up.
    subset: Which split (train, validation or test) to return folds for.

  Returns:
    A single-element list for the validation and test subsets. For the train
    subset, every fold except the validation and test folds, or every fold
    when the model version maps to the all-folds index (-1).

  Raises:
    ValueError: If `subset` is not a known `Subset` member.
  """
  if subset == Subset.VALID:
    return [_VALID_FOLD[_MODEL_VERSION_TO_FOLD[model_version]]]
  if subset == Subset.TEST:
    return [_TEST_FOLD[_MODEL_VERSION_TO_FOLD[model_version]]]
  if subset != Subset.TRAIN:
    raise ValueError(f'Unknown {subset=}')

  every_fold = get_all_folds()
  # The all-folds model holds nothing out of training.
  if _MODEL_VERSION_TO_FOLD[model_version] == -1:
    return every_fold

  held_out = get_fold_names(model_version, Subset.VALID)
  held_out = held_out + get_fold_names(model_version, Subset.TEST)
  return [fold for fold in every_fold if fold not in held_out]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def get_fold_intervals(
    model_version: dna_client.ModelVersion,
    organism: dna_client.Organism,
    subset: Subset,
    example_regions_path: str | None = None,
) -> pd.DataFrame:
  """Returns the example intervals in `subset` for the model version.

  Args:
    model_version: Model version whose fold assignment is used.
    organism: Organism used to pick the default regions file when
      `example_regions_path` is not given.
    subset: Which split (train, validation or test) to return intervals for.
    example_regions_path: Optional path to a tab-separated file with
      chromosome, start, end and fold columns and no header row. Defaults to
      the entry for `organism` in `_DEFAULT_EXAMPLE_REGIONS`.

  Returns:
    The rows of the regions table whose fold is one of the folds returned by
    `get_fold_names(model_version, subset)`.
  """
  regions_source = (
      _DEFAULT_EXAMPLE_REGIONS[organism]
      if example_regions_path is None
      else example_regions_path
  )

  column_names = ['chromosome', 'start', 'end', 'fold']
  regions = pd.read_csv(regions_source, sep='\t', names=column_names)

  wanted_folds = get_fold_names(model_version, subset)
  in_subset = regions['fold'].isin(wanted_folds)
  return regions[in_subset]
|