kabudadada committed on
Commit
a1f2eee
·
1 Parent(s): 034d7df

Add essential alphagenome source files only

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. alphagenome/source +0 -1
  3. alphagenome/source/.gitattributes +1 -0
  4. alphagenome/source/.github/ISSUE_TEMPLATE/bug_report.yml +48 -0
  5. alphagenome/source/.github/ISSUE_TEMPLATE/config.yml +5 -0
  6. alphagenome/source/.github/workflows/presubmit_checks.yml +37 -0
  7. alphagenome/source/.github/workflows/release.yaml +45 -0
  8. alphagenome/source/.pylintrc +458 -0
  9. alphagenome/source/.readthedocs.yaml +42 -0
  10. alphagenome/source/CHANGELOG.md +42 -0
  11. alphagenome/source/CONTRIBUTING.md +25 -0
  12. alphagenome/source/LICENSE +202 -0
  13. alphagenome/source/README.md +235 -0
  14. alphagenome/source/__init__.py +4 -0
  15. alphagenome/source/colabs/batch_variant_scoring.ipynb +0 -0
  16. alphagenome/source/colabs/essential_commands.ipynb +1405 -0
  17. alphagenome/source/colabs/example_analysis_workflow.ipynb +0 -0
  18. alphagenome/source/colabs/quick_start.ipynb +0 -0
  19. alphagenome/source/colabs/tissue_ontology_mapping.ipynb +0 -0
  20. alphagenome/source/colabs/visualization_modality_tour.ipynb +0 -0
  21. alphagenome/source/conftest.py +22 -0
  22. alphagenome/source/docs/Makefile +20 -0
  23. alphagenome/source/docs/README.md +6 -0
  24. alphagenome/source/docs/make.bat +49 -0
  25. alphagenome/source/docs/source/_templates/autosummary/class.rst +55 -0
  26. alphagenome/source/docs/source/api/data.md +89 -0
  27. alphagenome/source/docs/source/api/index.md +46 -0
  28. alphagenome/source/docs/source/api/interpretation.md +16 -0
  29. alphagenome/source/docs/source/api/models.md +55 -0
  30. alphagenome/source/docs/source/api/visualization.md +60 -0
  31. alphagenome/source/docs/source/conf.py +206 -0
  32. alphagenome/source/docs/source/exploring_model_metadata.md +93 -0
  33. alphagenome/source/docs/source/faqs.md +378 -0
  34. alphagenome/source/docs/source/index.md +82 -0
  35. alphagenome/source/docs/source/installation.md +73 -0
  36. alphagenome/source/docs/source/references.md +7 -0
  37. alphagenome/source/docs/source/refs.bib +50 -0
  38. alphagenome/source/docs/source/tutorials/index.md +55 -0
  39. alphagenome/source/docs/source/user_guides/index.md +46 -0
  40. alphagenome/source/docs/source/variant_scoring.md +257 -0
  41. alphagenome/source/docs/source/visualization_library_basics.md +153 -0
  42. alphagenome/source/hatch_build.py +49 -0
  43. alphagenome/source/pyproject.toml +131 -0
  44. alphagenome/source/scripts/process_gtf.py +42 -0
  45. alphagenome/source/src/__init__.py +4 -0
  46. alphagenome/source/src/alphagenome/__init__.py +18 -0
  47. alphagenome/source/src/alphagenome/colab_utils.py +62 -0
  48. alphagenome/source/src/alphagenome/colab_utils_test.py +68 -0
  49. alphagenome/source/src/alphagenome/data/__init__.py +15 -0
  50. alphagenome/source/src/alphagenome/data/fold_intervals.py +115 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.ipynb filter=lfs diff=lfs merge=lfs -text
alphagenome/source DELETED
@@ -1 +0,0 @@
1
- Subproject commit b7d3963ce241c2390ea18bb99fa0722e1c169952
 
 
alphagenome/source/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.ipynb linguist-documentation
alphagenome/source/.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Bug report
3
+ description: >-
4
+ Report a bug or unexpected behavior to help us improve AlphaGenome.
5
+ labels:
6
+ - bug
7
+
8
+ body:
9
+ - type: markdown
10
+ attributes:
11
+ value: >
12
+ ## Thank you for helping us improve AlphaGenome!
13
+
14
+ * Please verify that your issue has not been reported using
15
+ [Issue search][issue search].
16
+
17
+ * If you have a question about usage, please
18
+ consider [starting a discussion][Discussions].
19
+
20
+ * If you prefer a non-templated issue report, click [here][Raw report].
21
+
22
+ [Discussions]: https://www.alphagenomecommunity.com/
23
+
24
+ [issue search]: https://github.com/google-deepmind/alphagenome/search?q=is%3Aissue&type=issues
25
+
26
+ [Raw report]: https://github.com/google-deepmind/alphagenome/issues/new?template=none
27
+ - type: textarea
28
+ attributes:
29
+ label: Description
30
+ description: A concise description of the bug.
31
+ placeholder: |
32
+ Text may use markdown formatting.
33
+ ```python
34
+ # for codeblocks, use triple backticks
35
+ ```
36
+ validations:
37
+ required: true
38
+ - type: textarea
39
+ attributes:
40
+ label: System info (python version, alphagenome version, etc.)
41
+ description: >-
42
+ Include the output of `import alphagenome; alphagenome.__version__`
43
+ placeholder: |
44
+ ```
45
+ ...
46
+ ```
47
+ validations:
48
+ required: true
alphagenome/source/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: Have questions or need support?
4
+ url: https://www.alphagenomecommunity.com/
5
+ about: Please ask questions on our community forums.
alphagenome/source/.github/workflows/presubmit_checks.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: CI
16
+ on: [push, pull_request]
17
+ jobs:
18
+ test:
19
+ runs-on: ${{ matrix.os }}
20
+ strategy:
21
+ fail-fast: false
22
+ matrix:
23
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
24
+ os: [ubuntu-latest]
25
+ steps:
26
+ - uses: actions/checkout@v4
27
+ - name: Set up Python ${{ matrix.python-version }}
28
+ uses: actions/setup-python@v5
29
+ with:
30
+ python-version: ${{ matrix.python-version }}
31
+ - name: Install alphagenome with dependencies
32
+ run: |
33
+ python -m pip install -U pip hatch
34
+ - name: Check
35
+ run: python -m hatch run check:all
36
+ - name: Unit tests
37
+ run: python -m hatch test
alphagenome/source/.github/workflows/release.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: Release
16
+
17
+ on:
18
+ release:
19
+ types: [published]
20
+
21
+ # Use "trusted publishing", see https://docs.pypi.org/trusted-publishers/
22
+ jobs:
23
+ release:
24
+ name: Upload release to PyPI
25
+ runs-on: ubuntu-latest
26
+ environment:
27
+ name: pypi
28
+ url: https://pypi.org/p/alphagenome
29
+ permissions:
30
+ id-token: write
31
+ steps:
32
+ - uses: actions/checkout@v4
33
+ with:
34
+ filter: blob:none
35
+ fetch-depth: 0
36
+ - name: Set up Python 3.12
37
+ uses: actions/setup-python@v5
38
+ with:
39
+ python-version: 3.12
40
+ - name: Install hatch
41
+ run: python -m pip install -U pip hatch
42
+ - name: Build package
43
+ run: python -m hatch build
44
+ - name: Publish package distributions to PyPI
45
+ uses: pypa/gh-action-pypi-publish@release/v1
alphagenome/source/.pylintrc ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This Pylint rcfile contains a best-effort configuration to uphold the
2
+ # best-practices and style described in the Google Python style guide:
3
+ # https://google.github.io/styleguide/pyguide.html
4
+ #
5
+ # Its original canonical open-source location is:
6
+ # https://google.github.io/styleguide/pylintrc
7
+ #
8
+ # Also includes some modifications specific to this repository.
9
+
10
+ [MASTER]
11
+
12
+ # Add files or directories to the ignore list. They should be base names, not
13
+ # paths.
14
+ ignore=third_party,
15
+ ./src/alphagenome/protos
16
+
17
+ # Add files or directories matching the regex patterns to the ignore list. The
18
+ # regex matches against base names, not paths.
19
+ ignore-patterns=
20
+
21
+ # Pickle collected data for later comparisons.
22
+ persistent=no
23
+
24
+ # List of plugins (as comma separated values of python modules names) to load,
25
+ # usually to register additional checkers.
26
+ load-plugins=
27
+
28
+ # Use multiple processes to speed up Pylint.
29
+ jobs=4
30
+
31
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
32
+ # active Python interpreter and may run arbitrary code.
33
+ unsafe-load-any-extension=no
34
+
35
+ # A comma-separated list of package or module names from where C extensions may
36
+ # be loaded. Extensions are loaded into the active Python interpreter and may
37
+ # run arbitrary code.
38
+ extension-pkg-allow-list=
39
+
40
+ # Minimum Python version to use for version dependent checks.
41
+ py-version=3.10
42
+
43
+ [MESSAGES CONTROL]
44
+
45
+ # Only show warnings with the listed confidence levels. Leave empty to show
46
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
47
+ confidence=
48
+
49
+ # Enable the message, report, category or checker with the given id(s). You can
50
+ # either give multiple identifiers separated by comma (,) or put this option
51
+ # multiple times (only on the command line, not in the configuration file where
52
+ # it should appear only once). See also the "--disable" option for examples.
53
+ #enable=
54
+
55
+ # Disable the message, report, category or checker with the given id(s). You
56
+ # can either give multiple identifiers separated by comma (,) or put this
57
+ # option multiple times (only on the command line, not in the configuration
58
+ # file where it should appear only once). You can also use "--disable=all" to
59
+ # disable everything first and then reenable specific checks. For example, if
60
+ # you want to run only the similarities checker, you can use "--disable=all
61
+ # --enable=similarities". If you want to run only the classes checker, but have
62
+ # no Warning level messages displayed, use "--disable=all --enable=classes
63
+ # --disable=W"
64
+ disable=abstract-method,
65
+ apply-builtin,
66
+ arguments-differ,
67
+ attribute-defined-outside-init,
68
+ backtick,
69
+ bad-option-value,
70
+ basestring-builtin,
71
+ buffer-builtin,
72
+ c-extension-no-member,
73
+ chained-comparison,
74
+ cmp-builtin,
75
+ cmp-method,
76
+ coerce-builtin,
77
+ coerce-method,
78
+ consider-iterating-dictionary,
79
+ consider-using-enumerate,
80
+ consider-using-in,
81
+ delslice-method,
82
+ div-method,
83
+ duplicate-code,
84
+ eq-without-hash,
85
+ execfile-builtin,
86
+ file-builtin,
87
+ filter-builtin-not-iterating,
88
+ fixme,
89
+ getslice-method,
90
+ global-statement,
91
+ hex-method,
92
+ idiv-method,
93
+ implicit-str-concat-in-sequence,
94
+ import-error,
95
+ import-self,
96
+ import-star-module-level,
97
+ inconsistent-return-statements,
98
+ input-builtin,
99
+ intern-builtin,
100
+ invalid-field-call,
101
+ invalid-str-codec,
102
+ locally-disabled,
103
+ long-builtin,
104
+ long-suffix,
105
+ map-builtin-not-iterating,
106
+ metaclass-assignment,
107
+ misplaced-comparison-constant,
108
+ missing-function-docstring,
109
+ missing-module-docstring,
110
+ next-method-called,
111
+ next-method-defined,
112
+ no-absolute-import,
113
+ no-else-break,
114
+ no-else-continue,
115
+ no-else-raise,
116
+ no-else-return,
117
+ no-init, # added
118
+ no-member,
119
+ no-name-in-module,
120
+ no-self-use,
121
+ nonzero-method,
122
+ not-an-iterable, # false positives around dataclasses
123
+ not-callable, # false positives for jax.jit
124
+ oct-method,
125
+ old-division,
126
+ old-ne-operator,
127
+ old-octal-literal,
128
+ old-raise-syntax,
129
+ parameter-unpacking,
130
+ print-statement,
131
+ raising-string,
132
+ range-builtin-not-iterating,
133
+ raw_input-builtin,
134
+ rdiv-method,
135
+ reduce-builtin,
136
+ relative-import,
137
+ reload-builtin,
138
+ round-builtin,
139
+ setslice-method,
140
+ signature-differs,
141
+ standarderror-builtin,
142
+ suppressed-message,
143
+ sys-max-int,
144
+ too-few-public-methods,
145
+ too-many-ancestors,
146
+ too-many-arguments,
147
+ too-many-boolean-expressions,
148
+ too-many-branches,
149
+ too-many-instance-attributes,
150
+ too-many-locals,
151
+ too-many-nested-blocks,
152
+ too-many-positional-arguments,
153
+ too-many-public-methods,
154
+ too-many-return-statements,
155
+ too-many-statements,
156
+ trailing-newlines,
157
+ unichr-builtin,
158
+ unicode-builtin,
159
+ unnecessary-comprehension,
160
+ unnecessary-lambda-assignment,
161
+ unnecessary-pass,
162
+ unpacking-in-except,
163
+ use-dict-literal,
164
+ useless-else-on-loop,
165
+ useless-object-inheritance,
166
+ useless-suppression,
167
+ using-cmp-argument,
168
+ wrong-import-order,
169
+ xrange-builtin,
170
+ zip-builtin-not-iterating,
171
+
172
+ [REPORTS]
173
+
174
+ # Set the output format. Available formats are text, parseable, colorized, msvs
175
+ # (visual studio) and html. You can also give a reporter class, eg
176
+ # mypackage.mymodule.MyReporterClass.
177
+ output-format=text
178
+
179
+ # Tells whether to display a full report or only the messages
180
+ reports=no
181
+
182
+ # Python expression which should return a note less than 10 (10 is the highest
183
+ # note). You have access to the variables error, warning, statement, which
184
+ # respectively contain the number of errors / warnings messages and the total
185
+ # number of statements analyzed. This is used by the global evaluation report
186
+ # (RP0004).
187
+ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
188
+
189
+ # Template used to display messages. This is a python new-style format string
190
+ # used to format the message information. See doc for all details
191
+ #msg-template=
192
+
193
+
194
+ [BASIC]
195
+
196
+ # Good variable names which should always be accepted, separated by a comma
197
+ good-names=main,_
198
+
199
+ # Bad variable names which should always be refused, separated by a comma
200
+ bad-names=
201
+
202
+ # Colon-delimited sets of names that determine each other's naming style when
203
+ # the name regexes allow several styles.
204
+ name-group=
205
+
206
+ # Include a hint for the correct naming format with invalid-name
207
+ include-naming-hint=no
208
+
209
+ # List of decorators that produce properties, such as abc.abstractproperty. Add
210
+ # to this list to register other decorators that produce valid properties.
211
+ property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
212
+
213
+ # Regular expression matching correct function names
214
+ function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
215
+
216
+ # Regular expression matching correct variable names
217
+ variable-rgx=^[a-z][a-z0-9_]*$
218
+
219
+ # Regular expression matching correct constant names
220
+ const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
221
+
222
+ # Regular expression matching correct attribute names
223
+ attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
224
+
225
+ # Regular expression matching correct argument names
226
+ argument-rgx=^[a-z][a-z0-9_]*$
227
+
228
+ # Regular expression matching correct class attribute names
229
+ class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
230
+
231
+ # Regular expression matching correct inline iteration names
232
+ inlinevar-rgx=^[a-z][a-z0-9_]*$
233
+
234
+ # Regular expression matching correct class names
235
+ class-rgx=^_?[A-Z][a-zA-Z0-9]*$
236
+
237
+ # Regular expression matching correct module names
238
+ module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
239
+
240
+ # Regular expression matching correct method names
241
+ method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
242
+
243
+ # Regular expression which should only match function or class names that do
244
+ # not require a docstring.
245
+ no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
246
+
247
+ # Minimum line length for functions/classes that require docstrings, shorter
248
+ # ones are exempt.
249
+ docstring-min-length=10
250
+
251
+
252
+ [TYPECHECK]
253
+
254
+ # List of decorators that produce context managers, such as
255
+ # contextlib.contextmanager. Add to this list to register other decorators that
256
+ # produce valid context managers.
257
+ contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
258
+
259
+ # Tells whether missing members accessed in mixin class should be ignored. A
260
+ # mixin class is detected if its name ends with "mixin" (case insensitive).
261
+ ignore-mixin-members=yes
262
+
263
+ # List of module names for which member attributes should not be checked
264
+ # (useful for modules/projects where namespaces are manipulated during runtime
265
+ # and thus existing member attributes cannot be deduced by static analysis. It
266
+ # supports qualified module names, as well as Unix pattern matching.
267
+ ignored-modules=
268
+
269
+ # List of class names for which member attributes should not be checked (useful
270
+ # for classes with dynamically set attributes). This supports the use of
271
+ # qualified names.
272
+ ignored-classes=optparse.Values,thread._local,_thread._local
273
+
274
+ # List of members which are set dynamically and missed by pylint inference
275
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
276
+ # expressions are accepted.
277
+ generated-members=
278
+
279
+
280
+ [FORMAT]
281
+
282
+ # Maximum number of characters on a single line.
283
+ max-line-length=80
284
+
285
+ # lines made too long by directives to pytype.
286
+
287
+ # Regexp for a line that is allowed to be longer than the limit.
288
+ ignore-long-lines=(?x)
289
+ (^\s*(import|from)\s
290
+ |\$Id:\s\/\/depot\/.+#\d+\s\$
291
+ |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+')
292
+ |^\s*\#\ LINT\.ThenChange
293
+ |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$
294
+ |pylint
295
+ |"""
296
+ |\#
297
+ |lambda
298
+ |(https?|ftp):)
299
+
300
+ # Allow the body of an if to be on the same line as the test if there is no
301
+ # else.
302
+ single-line-if-stmt=yes
303
+
304
+ # Maximum number of lines in a module
305
+ max-module-lines=99999
306
+
307
+ # String used as indentation unit. The internal Google style guide mandates 2
308
+ # spaces. Google's externally-published style guide says 4, consistent with
309
+ # PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
310
+ # projects (like TensorFlow).
311
+ indent-string=' '
312
+
313
+ # Number of spaces of indent required inside a hanging or continued line.
314
+ indent-after-paren=4
315
+
316
+ # Expected format of line ending, e.g., empty (any line ending), LF or CRLF.
317
+ expected-line-ending-format=
318
+
319
+
320
+ [MISCELLANEOUS]
321
+
322
+ # List of note tags to take in consideration, separated by a comma.
323
+ notes=TODO
324
+
325
+
326
+ [STRING]
327
+
328
+ # This flag controls whether inconsistent-quotes generates a warning when the
329
+ # character used as a quote delimiter is used inconsistently within a module.
330
+ check-quote-consistency=yes
331
+
332
+
333
+ [VARIABLES]
334
+
335
+ # Tells whether we should check for unused import in __init__ files.
336
+ init-import=no
337
+
338
+ # A regular expression matching the name of dummy variables (i.e., expectedly
339
+ # not used).
340
+ dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
341
+
342
+ # List of additional names supposed to be defined in builtins. Remember that
343
+ # you should avoid defining new builtins when possible.
344
+ additional-builtins=
345
+
346
+ # List of strings which can identify a callback function by name. A callback
347
+ # name must start or end with one of those strings.
348
+ callbacks=cb_,_cb
349
+
350
+ # List of qualified module names which can have objects that can redefine
351
+ # builtins.
352
+ redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
353
+
354
+
355
+ [LOGGING]
356
+
357
+ # Logging modules to check that the string format arguments are in logging
358
+ # function parameter format
359
+ logging-modules=logging,absl.logging,tensorflow.io.logging
360
+
361
+
362
+ [SIMILARITIES]
363
+
364
+ # Minimum lines number of a similarity.
365
+ min-similarity-lines=4
366
+
367
+ # Ignore comments when computing similarities.
368
+ ignore-comments=yes
369
+
370
+ # Ignore docstrings when computing similarities.
371
+ ignore-docstrings=yes
372
+
373
+ # Ignore imports when computing similarities.
374
+ ignore-imports=no
375
+
376
+
377
+ [SPELLING]
378
+
379
+ # Spelling dictionary name. Available dictionaries: none. To make it work,
380
+ # install python-enchant package.
381
+ spelling-dict=
382
+
383
+ # List of comma separated words that should not be checked.
384
+ spelling-ignore-words=
385
+
386
+ # A path to a file that contains private dictionary; one word per line.
387
+ spelling-private-dict-file=
388
+
389
+ # Tells whether to store unknown words to indicated private dictionary in
390
+ # --spelling-private-dict-file option instead of raising a message.
391
+ spelling-store-unknown-words=no
392
+
393
+
394
+ [IMPORTS]
395
+
396
+ # Deprecated modules which should not be used, separated by a comma
397
+ deprecated-modules=regsub,
398
+ TERMIOS,
399
+ Bastion,
400
+ rexec,
401
+ sets
402
+
403
+ # Create a graph of every (i.e., internal and external) dependencies in the
404
+ # given file (report RP0402 must not be disabled)
405
+ import-graph=
406
+
407
+ # Create a graph of external dependencies in the given file (report RP0402 must
408
+ # not be disabled)
409
+ ext-import-graph=
410
+
411
+ # Create a graph of internal dependencies in the given file (report RP0402 must
412
+ # not be disabled)
413
+ int-import-graph=
414
+
415
+ # Force import order to recognize a module as part of the standard
416
+ # compatibility libraries.
417
+ known-standard-library=
418
+
419
+ # Force import order to recognize a module as part of a third party library.
420
+ known-third-party=enchant, absl
421
+
422
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
423
+ # 3 compatible code, which means that the block might have code that exists
424
+ # only in one or another interpreter, leading to false positives when analysed.
425
+ analyse-fallback-blocks=no
426
+
427
+
428
+ [CLASSES]
429
+
430
+ # List of method names used to declare (i.e., assign) instance attributes.
431
+ defining-attr-methods=__init__,
432
+ __new__,
433
+ setUp
434
+
435
+ # List of member names, which should be excluded from the protected access
436
+ # warning.
437
+ exclude-protected=_asdict,
438
+ _fields,
439
+ _replace,
440
+ _source,
441
+ _make
442
+
443
+ # List of valid names for the first argument in a class method.
444
+ valid-classmethod-first-arg=cls,
445
+ class_
446
+
447
+ # List of valid names for the first argument in a metaclass class method.
448
+ valid-metaclass-classmethod-first-arg=mcs
449
+
450
+
451
+ [EXCEPTIONS]
452
+
453
+ # Exceptions that will emit a warning when being caught. Defaults to
454
+ # "Exception"
455
+ overgeneral-exceptions=builtins.StandardError,
456
+ builtins.Exception,
457
+ builtins.BaseException
458
+
alphagenome/source/.readthedocs.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Read the Docs configuration file for Sphinx projects
16
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
17
+
18
+ # Required
19
+ version: 2
20
+
21
+ # Set the OS, Python version and other tools you might need
22
+ build:
23
+ os: ubuntu-22.04
24
+ tools:
25
+ python: "3.10"
26
+ jobs:
27
+ pre_build:
28
+ # Copy colabs into docs/source so they can be included in the documentation.
29
+ - cp -r colabs docs/source/
30
+
31
+ # Build documentation in the "docs/" directory with Sphinx
32
+ sphinx:
33
+ builder: html
34
+ configuration: docs/source/conf.py
35
+ fail_on_warning: false
36
+
37
+ python:
38
+ install:
39
+ - method: pip
40
+ path: .
41
+ extra_requirements:
42
+ - docs
alphagenome/source/CHANGELOG.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to
7
+ [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
8
+
9
+ ## [0.2.0]
10
+
11
+ ### Added
12
+
13
+ - Add `is_insertion` and `is_deletion` properties to `Variant`.
14
+ - Add `DnaModel` abstract base class.
15
+ - Add support for center mask scoring over the entire sequence by passing
16
+ `None` for width.
17
+
18
+ ### Changed
19
+
20
+ - Move RPC requests and responses to `dna_model_service.proto`.
21
+ - Move functionality to convert `TrackData` to/from protocol buffers to
22
+ utility module.
23
+
24
+ ## [0.1.0]
25
+
26
+ ### Added
27
+
28
+ - Add `L2_DIFF_LOG1P` variant scoring aggregation type.
29
+ - Add `is_snv` property to `Variant`.
30
+ - Add non-zero mean track metadata field to model output metadata.
31
+ - Add optional interval argument to `predict_sequence`.
32
+
33
+ ## [0.0.2]
34
+
35
+ ### Added
36
+
37
+ - `colab_utils` module to wrap reading API keys from environment variables or
38
+ Google Colab secrets.
39
+
40
+ ## [0.0.1]
41
+
42
+ Initial release.
alphagenome/source/CONTRIBUTING.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to Contribute
2
+
3
+ ## Contributor License Agreement
4
+
5
+ Contributions to this project must be accompanied by a Contributor License
6
+ Agreement. You (or your employer) retain the copyright to your contribution;
7
+ this simply gives us permission to use and redistribute your contributions as
8
+ part of the project. Head over to <https://cla.developers.google.com/> to see
9
+ your current agreements on file or to sign a new one.
10
+
11
+ You generally only need to submit a CLA once, so if you've already submitted one
12
+ (even if it was for a different project), you probably don't need to do it
13
+ again.
14
+
15
+ ## Code reviews
16
+
17
+ All submissions, including submissions by project members, require review. We
18
+ use GitHub pull requests for this purpose. Consult
19
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
20
+ information on using pull requests.
21
+
22
+ ## Community Guidelines
23
+
24
+ This project follows [Google's Open Source Community
25
+ Guidelines](https://opensource.google/conduct/).
alphagenome/source/LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
alphagenome/source/README.md ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ![AlphaGenome header image](docs/source/_static/header.png)
2
+
3
+ # AlphaGenome
4
+
5
+ ![PyPI Python version](https://img.shields.io/pypi/pyversions/AlphaGenome)
6
+ ![Presubmit Checks](https://github.com/google-deepmind/alphagenome/actions/workflows/presubmit_checks.yml/badge.svg)
7
+
8
+ [**Get API key**](https://deepmind.google.com/science/alphagenome) |
9
+ [**Quick start**](#quick-start) | [**Installation**](#installation) |
10
+ [**Documentation**](https://www.alphagenomedocs.com/) |
11
+ [**Community**](https://www.alphagenomecommunity.com) |
12
+ [**Terms of Use**](https://deepmind.google.com/science/alphagenome/terms)
13
+
14
+ The AlphaGenome API provides access to AlphaGenome, Google DeepMind’s unifying
15
+ model for deciphering the regulatory code within DNA sequences. This repository
16
+ contains client-side code, examples and documentation to help you use the
17
+ AlphaGenome API.
18
+
19
+ AlphaGenome offers multimodal predictions, encompassing diverse functional
20
+ outputs such as gene expression, splicing patterns, chromatin features, and
21
+ contact maps (see [diagram below](#model_overview)). The model analyzes DNA
22
+ sequences of up to 1 million base pairs in length and can deliver predictions at
23
+ single base-pair resolution for most outputs. AlphaGenome achieves
24
+ state-of-the-art performance across a range of genomic prediction benchmarks,
25
+ including numerous diverse variant effect prediction tasks (detailed in
26
+ [Avsec et al. 2025](https://doi.org/10.1101/2025.06.25.661532)).
27
+
28
+ The API is offered free of charge for
29
+ [non-commercial use](https://deepmind.google.com/science/alphagenome/terms)
30
+ (subject to the terms of use). Query rates vary based on demand – it is well
31
+ suited for small- to medium-scale analyses such as analysing a limited number
32
+ of genomic regions or variants requiring 1000s of predictions, but is likely not
33
+ suitable for large-scale analyses requiring more than 1 million predictions.
34
+ Once you obtain your API key, you can easily get started by following our
35
+ [Quick Start Guide](#quick-start), or watching our
36
+ [AlphaGenome 101 tutorial](https://youtu.be/Xbvloe13nak).
37
+
38
+ <a id='model_overview'>
39
+
40
+ ![Model overview](docs/source/_static/model_overview.png)
41
+
42
+ </a>
43
+
44
+ The documentation also covers a set of comprehensive tutorials, variant scoring
45
+ strategies to efficiently score variant effects, and a visualization library to
46
+ generate `matplotlib` figures for the different output modalities.
47
+
48
+ We cover additional details of the capabilities and limitations in our
49
+ documentation. For support and feedback:
50
+
51
+ - Please submit bugs and any code-related issues on
52
+ [GitHub](https://github.com/google-deepmind/alphagenome/issues).
53
+ - For general feedback, questions about usage, and/or feature requests, please
54
+ use the [community forum](https://www.alphagenomecommunity.com) – it’s
55
+ actively monitored by our team so you're likely to find answers and insights
56
+ faster.
57
+ - If you can't find what you're looking for, please get in touch with the
58
+ AlphaGenome team at alphagenome@google.com and we will be happy to assist
59
+ you with questions. We’re working hard to answer all inquiries but there may
60
+ be a short delay in our response due to the high volume we are receiving.
61
+
62
+ ## Quick start
63
+
64
+ The quickest way to get started is to run our example notebooks in
65
+ [Google Colab](https://colab.research.google.com/). Here are some starter
66
+ notebooks:
67
+
68
+ - [Quick start](https://colab.research.google.com/github/google-deepmind/alphagenome/blob/main/colabs/quick_start.ipynb):
69
+ An introduction to quickly get you started with using the model and making
70
+ predictions.
71
+ - [Visualizing predictions](https://colab.research.google.com/github/google-deepmind/alphagenome/blob/main/colabs/visualization_modality_tour.ipynb):
72
+ Learn how to visualize different model predictions using the visualization
73
+ libraries.
74
+
75
+ Alternatively, you can dive straight in by following the
76
+ [installation guide](#installation) and start writing code! Here's an example of
77
+ making a variant prediction:
78
+
79
+ ```python
80
+ from alphagenome.data import genome
81
+ from alphagenome.models import dna_client
82
+ from alphagenome.visualization import plot_components
83
+ import matplotlib.pyplot as plt
84
+
85
+
86
+ API_KEY = 'MyAPIKey'
87
+ model = dna_client.create(API_KEY)
88
+
89
+ interval = genome.Interval(chromosome='chr22', start=35677410, end=36725986)
90
+ variant = genome.Variant(
91
+ chromosome='chr22',
92
+ position=36201698,
93
+ reference_bases='A',
94
+ alternate_bases='C',
95
+ )
96
+
97
+ outputs = model.predict_variant(
98
+ interval=interval,
99
+ variant=variant,
100
+ ontology_terms=['UBERON:0001157'],
101
+ requested_outputs=[dna_client.OutputType.RNA_SEQ],
102
+ )
103
+
104
+ plot_components.plot(
105
+ [
106
+ plot_components.OverlaidTracks(
107
+ tdata={
108
+ 'REF': outputs.reference.rna_seq,
109
+ 'ALT': outputs.alternate.rna_seq,
110
+ },
111
+ colors={'REF': 'dimgrey', 'ALT': 'red'},
112
+ ),
113
+ ],
114
+ interval=outputs.reference.rna_seq.interval.resize(2**15),
115
+ # Annotate the location of the variant as a vertical line.
116
+ annotations=[plot_components.VariantAnnotation([variant], alpha=0.8)],
117
+ )
118
+ plt.show()
119
+ ```
120
+
121
+ ## Installation
122
+
123
+ <!-- mdformat off(disable for [!TIP] format) -->
124
+
125
+ > [!TIP]
126
+ > You may optionally wish to create a
127
+ > [Python Virtual Environment](https://docs.python.org/3/tutorial/venv.html) to
128
+ > prevent conflicts with your system's Python environment.
129
+
130
+ <!-- mdformat on -->
131
+
132
+ To install `alphagenome`, clone a local copy of the repository and run `pip
133
+ install`:
134
+
135
+ ```bash
136
+ $ git clone https://github.com/google-deepmind/alphagenome.git
137
+ $ pip install ./alphagenome
138
+ ```
139
+
140
+ See [the documentation](https://www.alphagenomedocs.com/installation.html) for
141
+ information on alternative installation strategies.
142
+
143
+ ## Citing `alphagenome`
144
+
145
+ If you use AlphaGenome in your research, please cite using:
146
+
147
+ <!-- disableFinding(SNIPPET_INVALID_LANGUAGE) -->
148
+
149
+ ```bibtex
150
+ @article{alphagenome,
151
+ title={{AlphaGenome}: advancing regulatory variant effect prediction with a unified {DNA} sequence model},
152
+ author={Avsec, {\v Z}iga and Latysheva, Natasha and Cheng, Jun and Novati, Guido and Taylor, Kyle R. and Ward, Tom and Bycroft, Clare and Nicolaisen, Lauren and Arvaniti, Eirini and Pan, Joshua and Thomas, Raina and Dutordoir, Vincent and Perino, Matteo and De, Soham and Karollus, Alexander and Gayoso, Adam and Sargeant, Toby and Mottram, Anne and Wong, Lai Hong and Drot{\'a}r, Pavol and Kosiorek, Adam and Senior, Andrew and Tanburn, Richard and Applebaum, Taylor and Basu, Souradeep and Hassabis, Demis and Kohli, Pushmeet},
153
+ year={2025},
154
+ doi={10.1101/2025.06.25.661532},
155
+ publisher={Cold Spring Harbor Laboratory},
156
+ journal={bioRxiv}
157
+ }
158
+ ```
159
+
160
+ <!-- enableFinding(SNIPPET_INVALID_LANGUAGE) -->
161
+
162
+ ## Acknowledgements
163
+
164
+ AlphaGenome communicates with and/or references the following separate libraries
165
+ and packages:
166
+
167
+ * [Abseil](https://github.com/abseil/abseil-py)
168
+ * [anndata](https://github.com/scverse/anndata)
169
+ * [gRPC](https://github.com/grpc/grpc)
170
+ * [immutabledict](https://github.com/corenting/immutabledict)
171
+ * [intervaltree](https://github.com/chaimleib/intervaltree)
172
+ * [jaxtyping](https://github.com/patrick-kidger/jaxtyping)
173
+ * [matplotlib](https://matplotlib.org/)
174
+ * [ml_dtypes](https://github.com/jax-ml/ml_dtypes)
175
+ * [NumPy](https://numpy.org/)
176
+ * [pandas](https://pandas.pydata.org/)
177
+ * [protobuf](https://developers.google.com/protocol-buffers/)
178
+ * [pyarrow](https://arrow.apache.org/)
179
+ * [SciPy](https://scipy.org/)
180
+ * [seaborn](https://seaborn.pydata.org/)
181
+ * [tqdm](https://github.com/tqdm/tqdm)
182
+ * [typeguard](https://github.com/agronholm/typeguard)
183
+ * [typing_extensions](https://github.com/python/typing_extensions)
184
+ * [zstandard](https://github.com/indygreg/python-zstandard)
185
+
186
+ We thank all their contributors and maintainers!
187
+
188
+ ## License and Disclaimer
189
+
190
+ Copyright 2024 Google LLC
191
+
192
+ All software in this repository is licensed under the Apache License, Version
193
+ 2.0 (Apache 2.0); you may not use this except in compliance with the Apache 2.0
194
+ license. You may obtain a copy of the Apache 2.0 license at:
195
+ https://www.apache.org/licenses/LICENSE-2.0.
196
+
197
+ Examples and documentation to help you use the AlphaGenome API are licensed
198
+ under the Creative Commons Attribution 4.0 International License (CC-BY). You
199
+ may obtain a copy of the CC-BY license at:
200
+ https://creativecommons.org/licenses/by/4.0/legalcode.
201
+
202
+ Unless required by applicable law or agreed to in writing, all software and
203
+ materials distributed here under the Apache 2.0 or CC-BY licenses are
204
+ distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
205
+ either express or implied. See the licenses for the specific language governing
206
+ permissions and limitations under those licenses.
207
+
208
+ This is not an official Google product.
209
+
210
+ ### Third-party software
211
+
212
+ Your use of any third-party software, libraries or code referenced in the
213
+ materials in this repository (including the libraries listed in the
214
+ [Acknowledgments](#acknowledgements) section) may be governed by separate terms
215
+ and conditions or license provisions. Your use of the third-party software,
216
+ libraries or code is subject to any such terms and you should check that you can
217
+ comply with any applicable restrictions or terms and conditions before use.
218
+
219
+ ### Reference Datasets
220
+
221
+ A modified version of the GENCODE dataset (which can be found here:
222
+ https://www.gencodegenes.org/human/releases.html) is released with the client
223
+ code package for illustrative purposes, and is available with reference to the
224
+ following:
225
+
226
+ - Copyright © 2024 EMBL-EBI
227
+ - The GENCODE dataset is subject to the EMBL-EBI terms of use, available at
228
+ https://www.ebi.ac.uk/about/terms-of-use.
229
+ - Citation: Frankish A, et al (2018) GENCODE reference annotation for the
230
+ human and mouse genome.
231
+ - Further details about GENCODE can be found at
232
+ https://www.gencodegenes.org/human/releases.html, with additional citation
233
+ information at https://www.gencodegenes.org/pages/publications.html and
234
+ further acknowledgements can be found at
235
+ https://www.gencodegenes.org/pages/gencode.html.
alphagenome/source/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ alphagenome Project Package Initialization File
4
+ """
alphagenome/source/colabs/batch_variant_scoring.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
alphagenome/source/colabs/essential_commands.ipynb ADDED
@@ -0,0 +1,1405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "metadata": {
5
+ "id": "LIIksEJ7fbxF"
6
+ },
7
+ "cell_type": "markdown",
8
+ "source": [
9
+ "# Essential commands\n",
10
+ "The following describes essential commands for interacting with the AlphaGenome API. It is broken into two sections: data and methods.\n",
11
+ "\n"
12
+ ]
13
+ },
14
+ {
15
+ "metadata": {
16
+ "id": "gcms9aHWNnqs"
17
+ },
18
+ "cell_type": "markdown",
19
+ "source": [
20
+ "```{tip}\n",
21
+ "Open this tutorial in Google Colab for interactive viewing.\n",
22
+ "```"
23
+ ]
24
+ },
25
+ {
26
+ "metadata": {
27
+ "executionInfo": {
28
+ "elapsed": 13,
29
+ "status": "ok",
30
+ "timestamp": 1749822645556,
31
+ "user": {
32
+ "displayName": "",
33
+ "userId": ""
34
+ },
35
+ "user_tz": -60
36
+ },
37
+ "id": "iEs6z4rGe3lk"
38
+ },
39
+ "cell_type": "code",
40
+ "source": [
41
+ "# @title Install AlphaGenome\n",
42
+ "\n",
43
+ "# @markdown Run this cell to install AlphaGenome.\n",
44
+ "from IPython.display import clear_output\n",
45
+ "! pip install alphagenome\n",
46
+ "clear_output()"
47
+ ],
48
+ "outputs": [],
49
+ "execution_count": 1
50
+ },
51
+ {
52
+ "metadata": {
53
+ "id": "rKyGK083Wwh7"
54
+ },
55
+ "cell_type": "markdown",
56
+ "source": [
57
+ "# Imports"
58
+ ]
59
+ },
60
+ {
61
+ "metadata": {
62
+ "executionInfo": {
63
+ "elapsed": 1070,
64
+ "status": "ok",
65
+ "timestamp": 1749822646891,
66
+ "user": {
67
+ "displayName": "",
68
+ "userId": ""
69
+ },
70
+ "user_tz": -60
71
+ },
72
+ "id": "V7MD3DBEfJwf"
73
+ },
74
+ "cell_type": "code",
75
+ "source": [
76
+ "from alphagenome.data import genome\n",
77
+ "from alphagenome.models import dna_client\n",
78
+ "import numpy as np\n",
79
+ "import pandas as pd\n",
80
+ "from google.colab import userdata"
81
+ ],
82
+ "outputs": [],
83
+ "execution_count": 2
84
+ },
85
+ {
86
+ "metadata": {
87
+ "id": "dzkwq2tyfj0q"
88
+ },
89
+ "cell_type": "markdown",
90
+ "source": [
91
+ "## Data: model inputs"
92
+ ]
93
+ },
94
+ {
95
+ "metadata": {
96
+ "id": "3qR6e2XtW5IZ"
97
+ },
98
+ "cell_type": "markdown",
99
+ "source": [
100
+ "### Genomic interval\n",
101
+ "\n",
102
+ "A genomic interval is specified using `genome.Interval`:"
103
+ ]
104
+ },
105
+ {
106
+ "metadata": {
107
+ "executionInfo": {
108
+ "elapsed": 57,
109
+ "status": "ok",
110
+ "timestamp": 1749822647220,
111
+ "user": {
112
+ "displayName": "",
113
+ "userId": ""
114
+ },
115
+ "user_tz": -60
116
+ },
117
+ "id": "XIZHnO32W4Hn"
118
+ },
119
+ "cell_type": "code",
120
+ "source": [
121
+ "interval = genome.Interval(chromosome='chr1', start=1_000, end=1_010)"
122
+ ],
123
+ "outputs": [],
124
+ "execution_count": 3
125
+ },
126
+ {
127
+ "metadata": {
128
+ "id": "Qn9x1ArcXLVI"
129
+ },
130
+ "cell_type": "markdown",
131
+ "source": [
132
+ "By default, these are human hg38 intervals. See the\n",
133
+ "[FAQ](https://www.alphagenomedocs.com/faqs.html#what-are-the-reference-genome-versions-used-by-the-model) for more\n",
134
+ "details on organisms and genome versions.\n"
135
+ ]
136
+ },
137
+ {
138
+ "metadata": {
139
+ "id": "PCGNRUfHXOL1"
140
+ },
141
+ "cell_type": "markdown",
142
+ "source": [
143
+ "#### Interval properties\n",
144
+ "\n",
145
+ "Access some handy properties of the interval:\n"
146
+ ]
147
+ },
148
+ {
149
+ "metadata": {
150
+ "executionInfo": {
151
+ "elapsed": 66,
152
+ "status": "ok",
153
+ "timestamp": 1749822647565,
154
+ "user": {
155
+ "displayName": "",
156
+ "userId": ""
157
+ },
158
+ "user_tz": -60
159
+ },
160
+ "id": "8bn73Lm3XL1C",
161
+ "outputId": "f872b5f0-51bd-4455-9ec6-96c3d19a5c1d"
162
+ },
163
+ "cell_type": "code",
164
+ "source": [
165
+ "interval.center()"
166
+ ],
167
+ "outputs": [
168
+ {
169
+ "data": {
170
+ "text/plain": [
171
+ "1005"
172
+ ]
173
+ },
174
+ "execution_count": 4,
175
+ "metadata": {},
176
+ "output_type": "execute_result"
177
+ }
178
+ ],
179
+ "execution_count": 4
180
+ },
181
+ {
182
+ "metadata": {
183
+ "executionInfo": {
184
+ "elapsed": 56,
185
+ "status": "ok",
186
+ "timestamp": 1749822647918,
187
+ "user": {
188
+ "displayName": "",
189
+ "userId": ""
190
+ },
191
+ "user_tz": -60
192
+ },
193
+ "id": "fJVk-ocQXWhm",
194
+ "outputId": "d0203696-36b6-4ca5-a204-f417284f2ca7"
195
+ },
196
+ "cell_type": "code",
197
+ "source": [
198
+ "interval.width"
199
+ ],
200
+ "outputs": [
201
+ {
202
+ "data": {
203
+ "text/plain": [
204
+ "10"
205
+ ]
206
+ },
207
+ "execution_count": 5,
208
+ "metadata": {},
209
+ "output_type": "execute_result"
210
+ }
211
+ ],
212
+ "execution_count": 5
213
+ },
214
+ {
215
+ "metadata": {
216
+ "id": "BvlmfYgBXXig"
217
+ },
218
+ "cell_type": "markdown",
219
+ "source": [
220
+ "#### Resize\n",
221
+ "\n",
222
+ "Use `genome.Interval.resize` to resize the interval\n",
223
+ "around its center point:"
224
+ ]
225
+ },
226
+ {
227
+ "metadata": {
228
+ "executionInfo": {
229
+ "elapsed": 64,
230
+ "status": "ok",
231
+ "timestamp": 1749822648252,
232
+ "user": {
233
+ "displayName": "",
234
+ "userId": ""
235
+ },
236
+ "user_tz": -60
237
+ },
238
+ "id": "y72ZqANrXehY",
239
+ "outputId": "5c001c60-01db-4807-8508-a795a88dc629"
240
+ },
241
+ "cell_type": "code",
242
+ "source": [
243
+ "interval.resize(100)"
244
+ ],
245
+ "outputs": [
246
+ {
247
+ "data": {
248
+ "text/plain": [
249
+ "Interval(chromosome='chr1', start=955, end=1055, strand='.', name='')"
250
+ ]
251
+ },
252
+ "execution_count": 6,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ }
256
+ ],
257
+ "execution_count": 6
258
+ },
259
+ {
260
+ "metadata": {
261
+ "id": "ZrM4rMJDXkNF"
262
+ },
263
+ "cell_type": "markdown",
264
+ "source": [
265
+ "#### Compare intervals\n",
266
+ "\n",
267
+ "We can also check the interval's relationship to other intervals:"
268
+ ]
269
+ },
270
+ {
271
+ "metadata": {
272
+ "executionInfo": {
273
+ "elapsed": 68,
274
+ "status": "ok",
275
+ "timestamp": 1749822648622,
276
+ "user": {
277
+ "displayName": "",
278
+ "userId": ""
279
+ },
280
+ "user_tz": -60
281
+ },
282
+ "id": "Ye04nJETXmBL"
283
+ },
284
+ "cell_type": "code",
285
+ "source": [
286
+ "second_interval = genome.Interval(chromosome='chr1', start=1_005, end=1_015)"
287
+ ],
288
+ "outputs": [],
289
+ "execution_count": 7
290
+ },
291
+ {
292
+ "metadata": {
293
+ "executionInfo": {
294
+ "elapsed": 62,
295
+ "status": "ok",
296
+ "timestamp": 1749822648949,
297
+ "user": {
298
+ "displayName": "",
299
+ "userId": ""
300
+ },
301
+ "user_tz": -60
302
+ },
303
+ "id": "9yecEDzAXpIS",
304
+ "outputId": "d5a0d62c-4d96-4945-c33b-9566f5c9d1b0"
305
+ },
306
+ "cell_type": "code",
307
+ "source": [
308
+ "interval.overlaps(second_interval)"
309
+ ],
310
+ "outputs": [
311
+ {
312
+ "data": {
313
+ "text/plain": [
314
+ "True"
315
+ ]
316
+ },
317
+ "execution_count": 8,
318
+ "metadata": {},
319
+ "output_type": "execute_result"
320
+ }
321
+ ],
322
+ "execution_count": 8
323
+ },
324
+ {
325
+ "metadata": {
326
+ "executionInfo": {
327
+ "elapsed": 61,
328
+ "status": "ok",
329
+ "timestamp": 1749822649266,
330
+ "user": {
331
+ "displayName": "",
332
+ "userId": ""
333
+ },
334
+ "user_tz": -60
335
+ },
336
+ "id": "tMN-FGXZXsqr",
337
+ "outputId": "32d4de71-3d1c-4965-91c2-8f82b09d9aab"
338
+ },
339
+ "cell_type": "code",
340
+ "source": [
341
+ "interval.contains(second_interval)"
342
+ ],
343
+ "outputs": [
344
+ {
345
+ "data": {
346
+ "text/plain": [
347
+ "False"
348
+ ]
349
+ },
350
+ "execution_count": 9,
351
+ "metadata": {},
352
+ "output_type": "execute_result"
353
+ }
354
+ ],
355
+ "execution_count": 9
356
+ },
357
+ {
358
+ "metadata": {
359
+ "executionInfo": {
360
+ "elapsed": 330,
361
+ "status": "ok",
362
+ "timestamp": 1749822649862,
363
+ "user": {
364
+ "displayName": "",
365
+ "userId": ""
366
+ },
367
+ "user_tz": -60
368
+ },
369
+ "id": "sDjWXjJYXuPB",
370
+ "outputId": "f43f4e1b-e319-44c7-d6d9-68b5384d579c"
371
+ },
372
+ "cell_type": "code",
373
+ "source": [
374
+ "interval.intersect(second_interval)"
375
+ ],
376
+ "outputs": [
377
+ {
378
+ "data": {
379
+ "text/plain": [
380
+ "Interval(chromosome='chr1', start=1005, end=1010, strand='.', name='')"
381
+ ]
382
+ },
383
+ "execution_count": 10,
384
+ "metadata": {},
385
+ "output_type": "execute_result"
386
+ }
387
+ ],
388
+ "execution_count": 10
389
+ },
390
+ {
391
+ "metadata": {
392
+ "id": "X0U15RKjXwZL"
393
+ },
394
+ "cell_type": "markdown",
395
+ "source": [
396
+ "As a subtle point, AlphaGenome classes use 0-based indexing, meaning that the\n",
397
+ "interval includes the base pair at the `start` position up to the base pair at\n",
398
+ "the `end-1` position. See the [FAQ](https://www.alphagenomedocs.com/faqs.html#how-do-i-specify-a-genomic-region)\n",
399
+ "for more on this topic."
400
+ ]
401
+ },
402
+ {
403
+ "metadata": {
404
+ "id": "dIDUCQKOX1Vj"
405
+ },
406
+ "cell_type": "markdown",
407
+ "source": [
408
+ "### Genomic variant\n",
409
+ "\n",
410
+ "A `genome.Variant` specifies a genetic variant:\n"
411
+ ]
412
+ },
413
+ {
414
+ "metadata": {
415
+ "executionInfo": {
416
+ "elapsed": 54,
417
+ "status": "ok",
418
+ "timestamp": 1749822650248,
419
+ "user": {
420
+ "displayName": "",
421
+ "userId": ""
422
+ },
423
+ "user_tz": -60
424
+ },
425
+ "id": "R_D6AoKFXyBJ"
426
+ },
427
+ "cell_type": "code",
428
+ "source": [
429
+ "variant = genome.Variant(\n",
430
+ " chromosome='chr3', position=10_000, reference_bases='A', alternate_bases='C'\n",
431
+ ")"
432
+ ],
433
+ "outputs": [],
434
+ "execution_count": 11
435
+ },
436
+ {
437
+ "metadata": {
438
+ "id": "L9QcFhogX693"
439
+ },
440
+ "cell_type": "markdown",
441
+ "source": [
442
+ "This variant changes the base `A` to a `C` at position 10\\_000 on chromosome 3\\.\n",
443
+ "Note that the `position` attribute is 1-based to maintain compatibility with\n",
444
+ "common public variant formats (see [FAQ](https://www.alphagenomedocs.com/faqs.html#how-do-i-define-a-variant) for more\n",
445
+ "info.)"
446
+ ]
447
+ },
448
+ {
449
+ "metadata": {
450
+ "id": "l6SRhPTrYKY3"
451
+ },
452
+ "cell_type": "markdown",
453
+ "source": [
454
+ "#### Insertions or deletions (indels)\n",
455
+ "\n",
456
+ "Variants can also be larger than a single base, such as insertions or deletions:"
457
+ ]
458
+ },
459
+ {
460
+ "metadata": {
461
+ "executionInfo": {
462
+ "elapsed": 56,
463
+ "status": "ok",
464
+ "timestamp": 1749822650560,
465
+ "user": {
466
+ "displayName": "",
467
+ "userId": ""
468
+ },
469
+ "user_tz": -60
470
+ },
471
+ "id": "PMmSYhSfX9K9"
472
+ },
473
+ "cell_type": "code",
474
+ "source": [
475
+ "# Insertion variant.\n",
476
+ "variant = genome.Variant(\n",
477
+ " chromosome='chr3',\n",
478
+ " position=10_000,\n",
479
+ " reference_bases='T',\n",
480
+ " alternate_bases='CGTCAAT',\n",
481
+ ")\n",
482
+ "\n",
483
+ "# Deletion variant.\n",
484
+ "variant = genome.Variant(\n",
485
+ " chromosome='chr3',\n",
486
+ " position=10_000,\n",
487
+ " reference_bases='AGGGATC',\n",
488
+ " alternate_bases='C',\n",
489
+ ")"
490
+ ],
491
+ "outputs": [],
492
+ "execution_count": 12
493
+ },
494
+ {
495
+ "metadata": {
496
+ "id": "_v9VY9kqYRoP"
497
+ },
498
+ "cell_type": "markdown",
499
+ "source": [
500
+ "The sequence we pass for the `reference_bases` argument could differ from what\n",
501
+ "is actually at that location in the hg38 reference genome. The model will insert\n",
502
+ "whatever is passed as the reference and alternate bases into the sequence and\n",
503
+ "make predictions on them."
504
+ ]
505
+ },
506
+ {
507
+ "metadata": {
508
+ "id": "za3OTKasYYlb"
509
+ },
510
+ "cell_type": "markdown",
511
+ "source": [
512
+ "#### Reference interval\n",
513
+ "\n",
514
+ "We can get the `genome.Interval` corresponding to the\n",
515
+ "reference bases of the variant using `genome.Variant.reference_interval`:"
516
+ ]
517
+ },
518
+ {
519
+ "metadata": {
520
+ "executionInfo": {
521
+ "elapsed": 62,
522
+ "status": "ok",
523
+ "timestamp": 1749822650873,
524
+ "user": {
525
+ "displayName": "",
526
+ "userId": ""
527
+ },
528
+ "user_tz": -60
529
+ },
530
+ "id": "UUyBaqWtYfc0",
531
+ "outputId": "cb3fc331-604d-4d58-9802-f4491dfe9bb2"
532
+ },
533
+ "cell_type": "code",
534
+ "source": [
535
+ "variant = genome.Variant(\n",
536
+ " chromosome='chr3', position=10_000, reference_bases='A', alternate_bases='T'\n",
537
+ ")\n",
538
+ "\n",
539
+ "variant.reference_interval"
540
+ ],
541
+ "outputs": [
542
+ {
543
+ "data": {
544
+ "text/plain": [
545
+ "Interval(chromosome='chr3', start=9999, end=10000, strand='.', name='')"
546
+ ]
547
+ },
548
+ "execution_count": 13,
549
+ "metadata": {},
550
+ "output_type": "execute_result"
551
+ }
552
+ ],
553
+ "execution_count": 13
554
+ },
555
+ {
556
+ "metadata": {
557
+ "id": "DATTTBK6YiEZ"
558
+ },
559
+ "cell_type": "markdown",
560
+ "source": [
561
+ "A common use-case is to make predictions in a genome region around a variant,\n",
562
+ "which involves resizing the\n",
563
+ "`genome.Variant.reference_interval` to a sequence\n",
564
+ "length compatible with AlphaGenome:"
565
+ ]
566
+ },
567
+ {
568
+ "metadata": {
569
+ "executionInfo": {
570
+ "elapsed": 81,
571
+ "status": "ok",
572
+ "timestamp": 1749822651223,
573
+ "user": {
574
+ "displayName": "",
575
+ "userId": ""
576
+ },
577
+ "user_tz": -60
578
+ },
579
+ "id": "YijroJwOYnfU",
580
+ "outputId": "f923811e-b6c8-4aca-bc5a-d821377557d3"
581
+ },
582
+ "cell_type": "code",
583
+ "source": [
584
+ "input_interval = variant.reference_interval.resize(\n",
585
+ " dna_client.SEQUENCE_LENGTH_1MB\n",
586
+ ")\n",
587
+ "input_interval.width"
588
+ ],
589
+ "outputs": [
590
+ {
591
+ "data": {
592
+ "text/plain": [
593
+ "1048576"
594
+ ]
595
+ },
596
+ "execution_count": 14,
597
+ "metadata": {},
598
+ "output_type": "execute_result"
599
+ }
600
+ ],
601
+ "execution_count": 14
602
+ },
603
+ {
604
+ "metadata": {
605
+ "id": "qGIpFPd_YmYc"
606
+ },
607
+ "cell_type": "markdown",
608
+ "source": [
609
+ "#### Overlap with interval\n",
610
+ "\n",
611
+ "We can also check if a variant’s reference or alternate alleles overlap a `genome.Interval`:"
612
+ ]
613
+ },
614
+ {
615
+ "metadata": {
616
+ "executionInfo": {
617
+ "elapsed": 128,
618
+ "status": "ok",
619
+ "timestamp": 1749822651618,
620
+ "user": {
621
+ "displayName": "",
622
+ "userId": ""
623
+ },
624
+ "user_tz": -60
625
+ },
626
+ "id": "HmiX9TELYxTD",
627
+ "outputId": "3e267c68-30dd-4aaf-e316-2a87df03d96e"
628
+ },
629
+ "cell_type": "code",
630
+ "source": [
631
+ "variant = genome.Variant(\n",
632
+ " chromosome='chr3',\n",
633
+ " position=10_000,\n",
634
+ " reference_bases='T',\n",
635
+ " alternate_bases='CGTCAAT',\n",
636
+ ")\n",
637
+ "\n",
638
+ "interval = genome.Interval(chromosome='chr3', start=10_005, end=10_010)\n",
639
+ "\n",
640
+ "print('Reference overlaps:', variant.reference_overlaps(interval))\n",
641
+ "print('Alternative overlaps:', variant.alternate_overlaps(interval))"
642
+ ],
643
+ "outputs": [
644
+ {
645
+ "name": "stdout",
646
+ "output_type": "stream",
647
+ "text": [
648
+ "Reference overlaps: False\n",
649
+ "Alternative overlaps: True\n"
650
+ ]
651
+ }
652
+ ],
653
+ "execution_count": 15
654
+ },
655
+ {
656
+ "metadata": {
657
+ "id": "Kr0329VfZAZo"
658
+ },
659
+ "cell_type": "markdown",
660
+ "source": [
661
+ "##  Data: model outputs\n",
662
+ "\n",
663
+ "### Track data\n",
664
+ "\n",
665
+ "\u003ca href=\"https://services.google.com/fh/files/misc/trackdata.png\"\u003e\u003cimg src=\"https://services.google.com/fh/files/misc/trackdata.png\" alt=\"trackdata\" border=\"0\" height=500\u003e\u003c/a\u003e\n",
666
+ "\n",
667
+ "`track_data.TrackData` objects store model predictions.\n",
668
+ "They have the following properties (using `tdata` as an example of a\n",
669
+ "`track_data.TrackData` object):\n",
670
+ "\n",
671
+ "* `tdata.values` stores track predictions as a `numpy.ndarray`.\n",
672
+ "* `tdata.metadata` stores track metadata as a `pandas.DataFrame`. For\n",
673
+ " each track in the predicted values, there will be a corresponding row in the\n",
674
+ " track metadata describing its origin.\n",
675
+ "* `tdata.uns` contains additional unstructured metadata as a `dict`."
676
+ ]
677
+ },
678
+ {
679
+ "metadata": {
680
+ "id": "I3HTL-NTZR6n"
681
+ },
682
+ "cell_type": "markdown",
683
+ "source": [
684
+ "#### From scratch\n",
685
+ "\n",
686
+ "You can create your own `track_data.TrackData` object\n",
687
+ "from scratch by specifying the values and metadata manually. The metadata must\n",
688
+ "contain at least the columns name (the names of the tracks) and strand (the\n",
689
+ "strands of DNA that the tracks are on):\n"
690
+ ]
691
+ },
692
+ {
693
+ "metadata": {
694
+ "executionInfo": {
695
+ "elapsed": 58,
696
+ "status": "ok",
697
+ "timestamp": 1749822651961,
698
+ "user": {
699
+ "displayName": "",
700
+ "userId": ""
701
+ },
702
+ "user_tz": -60
703
+ },
704
+ "id": "uwsbveEvZBbt"
705
+ },
706
+ "cell_type": "code",
707
+ "source": [
708
+ "from alphagenome.data import track_data\n",
709
+ "\n",
710
+ "# Array has shape (4,3) -\u003e sequence is length 4 and there are 3 tracks.\n",
711
+ "values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(\n",
712
+ " np.float32\n",
713
+ ")\n",
714
+ "\n",
715
+ "# We have both the positive and negative strand values for track1, while track2\n",
716
+ "# contains unstranded data.\n",
717
+ "metadata = pd.DataFrame({\n",
718
+ " 'name': ['track1', 'track1', 'track2'],\n",
719
+ " 'strand': ['+', '-', '.'],\n",
720
+ "})\n",
721
+ "\n",
722
+ "tdata = track_data.TrackData(values=values, metadata=metadata)"
723
+ ],
724
+ "outputs": [],
725
+ "execution_count": 16
726
+ },
727
+ {
728
+ "metadata": {
729
+ "id": "qEc4ioZVZgR0"
730
+ },
731
+ "cell_type": "markdown",
732
+ "source": [
733
+ "#### Resolution\n",
734
+ "\n",
735
+ "It’s also useful to specify the resolution of the tracks and the genomic\n",
736
+ "interval that they come from, if you have this information available:\n"
737
+ ]
738
+ },
739
+ {
740
+ "metadata": {
741
+ "executionInfo": {
742
+ "elapsed": 55,
743
+ "status": "ok",
744
+ "timestamp": 1749822652304,
745
+ "user": {
746
+ "displayName": "",
747
+ "userId": ""
748
+ },
749
+ "user_tz": -60
750
+ },
751
+ "id": "JGnuY6g_ZaBL"
752
+ },
753
+ "cell_type": "code",
754
+ "source": [
755
+ "interval = genome.Interval(chromosome='chr1', start=1_000, end=1_004)\n",
756
+ "\n",
757
+ "tdata = track_data.TrackData(\n",
758
+ " values=values, metadata=metadata, resolution=1, interval=interval\n",
759
+ ")"
760
+ ],
761
+ "outputs": [],
762
+ "execution_count": 17
763
+ },
764
+ {
765
+ "metadata": {
766
+ "id": "QeN8D_yGZlVB"
767
+ },
768
+ "cell_type": "markdown",
769
+ "source": [
770
+ "Note that the length of the values has to match up with the interval width and\n",
771
+ "resolution. Here is an example specifying that the values actually represent\n",
772
+ "128bp resolution tracks (i.e., each number is a summary over 128 base pairs of\n",
773
+ "DNA):\n"
774
+ ]
775
+ },
776
+ {
777
+ "metadata": {
778
+ "executionInfo": {
779
+ "elapsed": 56,
780
+ "status": "ok",
781
+ "timestamp": 1749822652621,
782
+ "user": {
783
+ "displayName": "",
784
+ "userId": ""
785
+ },
786
+ "user_tz": -60
787
+ },
788
+ "id": "I-iPD_ZGZjMI"
789
+ },
790
+ "cell_type": "code",
791
+ "source": [
792
+ "interval = genome.Interval(chromosome='chr1', start=1_000, end=1_512)\n",
793
+ "\n",
794
+ "tdata = track_data.TrackData(\n",
795
+ " values=values, metadata=metadata, resolution=128, interval=interval\n",
796
+ ")"
797
+ ],
798
+ "outputs": [],
799
+ "execution_count": 18
800
+ },
801
+ {
802
+ "metadata": {
803
+ "id": "HhVG6w41ZqEp"
804
+ },
805
+ "cell_type": "markdown",
806
+ "source": [
807
+ "####  Converting between resolutions\n",
808
+ "\n",
809
+ "We can also interconvert between resolutions. For example, given 1bp resolution\n",
810
+ "predictions, we can downsample the resolution (by summing adjacent values) and\n",
811
+ "return a sequence of length 2:"
812
+ ]
813
+ },
814
+ {
815
+ "metadata": {
816
+ "executionInfo": {
817
+ "elapsed": 64,
818
+ "status": "ok",
819
+ "timestamp": 1749822652955,
820
+ "user": {
821
+ "displayName": "",
822
+ "userId": ""
823
+ },
824
+ "user_tz": -60
825
+ },
826
+ "id": "Ce41uPkcZoWd",
827
+ "outputId": "38adb4cb-7a38-4cc3-8f12-af6ea95dfc0b"
828
+ },
829
+ "cell_type": "code",
830
+ "source": [
831
+ "interval = genome.Interval(chromosome='chr1', start=1_000, end=1_004)\n",
832
+ "\n",
833
+ "tdata = track_data.TrackData(\n",
834
+ " values=values, metadata=metadata, resolution=1, interval=interval\n",
835
+ ")\n",
836
+ "\n",
837
+ "tdata = tdata.change_resolution(resolution=2)\n",
838
+ "tdata.values"
839
+ ],
840
+ "outputs": [
841
+ {
842
+ "data": {
843
+ "text/plain": [
844
+ "array([[ 3., 5., 7.],\n",
845
+ " [15., 17., 19.]], dtype=float32)"
846
+ ]
847
+ },
848
+ "execution_count": 19,
849
+ "metadata": {},
850
+ "output_type": "execute_result"
851
+ }
852
+ ],
853
+ "execution_count": 19
854
+ },
855
+ {
856
+ "metadata": {
857
+ "id": "23Ehe3duZqC3"
858
+ },
859
+ "cell_type": "markdown",
860
+ "source": [
861
+ "We can also upsample track data to get back to 1bp resolution and a sequence of\n",
862
+ "length 4 by repeating values while preserving the sum:"
863
+ ]
864
+ },
865
+ {
866
+ "metadata": {
867
+ "executionInfo": {
868
+ "elapsed": 65,
869
+ "status": "ok",
870
+ "timestamp": 1749822653280,
871
+ "user": {
872
+ "displayName": "",
873
+ "userId": ""
874
+ },
875
+ "user_tz": -60
876
+ },
877
+ "id": "dDi9OWWWZzq_",
878
+ "outputId": "51f3878b-d71e-48f7-f6c7-7a87830704ed"
879
+ },
880
+ "cell_type": "code",
881
+ "source": [
882
+ "tdata = tdata.change_resolution(resolution=1)\n",
883
+ "tdata.values"
884
+ ],
885
+ "outputs": [
886
+ {
887
+ "data": {
888
+ "text/plain": [
889
+ "array([[1.5, 2.5, 3.5],\n",
890
+ " [1.5, 2.5, 3.5],\n",
891
+ " [7.5, 8.5, 9.5],\n",
892
+ " [7.5, 8.5, 9.5]], dtype=float32)"
893
+ ]
894
+ },
895
+ "execution_count": 20,
896
+ "metadata": {},
897
+ "output_type": "execute_result"
898
+ }
899
+ ],
900
+ "execution_count": 20
901
+ },
902
+ {
903
+ "metadata": {
904
+ "id": "Rkne2GEwZ82L"
905
+ },
906
+ "cell_type": "markdown",
907
+ "source": [
908
+ "####  Filtering\n",
909
+ "\n",
910
+ "`track_data.TrackData` objects can be filtered by the\n",
911
+ "type of DNA strand the tracks are on:\n"
912
+ ]
913
+ },
914
+ {
915
+ "metadata": {
916
+ "executionInfo": {
917
+ "elapsed": 71,
918
+ "status": "ok",
919
+ "timestamp": 1749822653638,
920
+ "user": {
921
+ "displayName": "",
922
+ "userId": ""
923
+ },
924
+ "user_tz": -60
925
+ },
926
+ "id": "RoIaGXLtaBbz",
927
+ "outputId": "75db5fc1-a5a6-4798-e26f-9d62443abbeb"
928
+ },
929
+ "cell_type": "code",
930
+ "source": [
931
+ "print(\n",
932
+ " 'Positive strand tracks:',\n",
933
+ " tdata.filter_to_positive_strand().metadata.name.values,\n",
934
+ ")\n",
935
+ "print(\n",
936
+ " 'Negative strand tracks:',\n",
937
+ " tdata.filter_to_negative_strand().metadata.name.values,\n",
938
+ ")\n",
939
+ "print('Unstranded tracks:', tdata.filter_to_unstranded().metadata.name.values)"
940
+ ],
941
+ "outputs": [
942
+ {
943
+ "name": "stdout",
944
+ "output_type": "stream",
945
+ "text": [
946
+ "Positive strand tracks: ['track1']\n",
947
+ "Negative strand tracks: ['track1']\n",
948
+ "Unstranded tracks: ['track2']\n"
949
+ ]
950
+ }
951
+ ],
952
+ "execution_count": 21
953
+ },
954
+ {
955
+ "metadata": {
956
+ "id": "xogO-bWVaatH"
957
+ },
958
+ "cell_type": "markdown",
959
+ "source": [
960
+ "#### Resizing\n",
961
+ "\n",
962
+ "We can resize the `track_data.TrackData` to be either\n",
963
+ "smaller (by cropping):"
964
+ ]
965
+ },
966
+ {
967
+ "metadata": {
968
+ "executionInfo": {
969
+ "elapsed": 61,
970
+ "status": "ok",
971
+ "timestamp": 1749822654145,
972
+ "user": {
973
+ "displayName": "",
974
+ "userId": ""
975
+ },
976
+ "user_tz": -60
977
+ },
978
+ "id": "clj536M9abBp",
979
+ "outputId": "f7819f36-70c7-4b78-cb04-b0e2e030ff6e"
980
+ },
981
+ "cell_type": "code",
982
+ "source": [
983
+ "# Re-instantiating the original trackdata.\n",
984
+ "tdata = track_data.TrackData(\n",
985
+ " values=values, metadata=metadata, resolution=1, interval=interval\n",
986
+ ")\n",
987
+ "\n",
988
+ "# Resize from width (sequence length) of 4 down to 2.\n",
989
+ "tdata.resize(width=2).values"
990
+ ],
991
+ "outputs": [
992
+ {
993
+ "data": {
994
+ "text/plain": [
995
+ "array([[3., 4., 5.],\n",
996
+ " [6., 7., 8.]], dtype=float32)"
997
+ ]
998
+ },
999
+ "execution_count": 22,
1000
+ "metadata": {},
1001
+ "output_type": "execute_result"
1002
+ }
1003
+ ],
1004
+ "execution_count": 22
1005
+ },
1006
+ {
1007
+ "metadata": {
1008
+ "id": "wl3aNKu-akaO"
1009
+ },
1010
+ "cell_type": "markdown",
1011
+ "source": [
1012
+ "Or bigger (by padding with zeros):\n"
1013
+ ]
1014
+ },
1015
+ {
1016
+ "metadata": {
1017
+ "executionInfo": {
1018
+ "elapsed": 61,
1019
+ "status": "ok",
1020
+ "timestamp": 1749822654456,
1021
+ "user": {
1022
+ "displayName": "",
1023
+ "userId": ""
1024
+ },
1025
+ "user_tz": -60
1026
+ },
1027
+ "id": "pHWZRppBaeC1",
1028
+ "outputId": "087a212d-ca6f-4490-fe59-408e9a157ee2"
1029
+ },
1030
+ "cell_type": "code",
1031
+ "source": [
1032
+ "tdata.resize(width=8).values"
1033
+ ],
1034
+ "outputs": [
1035
+ {
1036
+ "data": {
1037
+ "text/plain": [
1038
+ "array([[ 0., 0., 0.],\n",
1039
+ " [ 0., 0., 0.],\n",
1040
+ " [ 0., 1., 2.],\n",
1041
+ " [ 3., 4., 5.],\n",
1042
+ " [ 6., 7., 8.],\n",
1043
+ " [ 9., 10., 11.],\n",
1044
+ " [ 0., 0., 0.],\n",
1045
+ " [ 0., 0., 0.]], dtype=float32)"
1046
+ ]
1047
+ },
1048
+ "execution_count": 23,
1049
+ "metadata": {},
1050
+ "output_type": "execute_result"
1051
+ }
1052
+ ],
1053
+ "execution_count": 23
1054
+ },
1055
+ {
1056
+ "metadata": {
1057
+ "id": "SRkggBayar2r"
1058
+ },
1059
+ "cell_type": "markdown",
1060
+ "source": [
1061
+ "#### Slicing\n",
1062
+ "\n",
1063
+ "We can slice into specific positions of the\n",
1064
+ "`track_data.TrackData`:"
1065
+ ]
1066
+ },
1067
+ {
1068
+ "metadata": {
1069
+ "executionInfo": {
1070
+ "elapsed": 56,
1071
+ "status": "ok",
1072
+ "timestamp": 1749822654784,
1073
+ "user": {
1074
+ "displayName": "",
1075
+ "userId": ""
1076
+ },
1077
+ "user_tz": -60
1078
+ },
1079
+ "id": "yHcMdzN1amMz",
1080
+ "outputId": "73a293d2-279c-4709-b79c-cfd24235e19d"
1081
+ },
1082
+ "cell_type": "code",
1083
+ "source": [
1084
+ "# Get the final 2 positions only.\n",
1085
+ "print('slice by position: ', tdata.slice_by_positions(start=2, end=4).values)\n",
1086
+ "# Same, but using slice_by_interval:\n",
1087
+ "print(\n",
1088
+ " 'slice by interval: ',\n",
1089
+ " tdata.slice_by_interval(\n",
1090
+ " genome.Interval(chromosome='chr1', start=1_002, end=1_004)\n",
1091
+ " ).values,\n",
1092
+ ")"
1093
+ ],
1094
+ "outputs": [
1095
+ {
1096
+ "name": "stdout",
1097
+ "output_type": "stream",
1098
+ "text": [
1099
+ "slice by position: [[ 6. 7. 8.]\n",
1100
+ " [ 9. 10. 11.]]\n",
1101
+ "slice by interval: [[ 6. 7. 8.]\n",
1102
+ " [ 9. 10. 11.]]\n"
1103
+ ]
1104
+ }
1105
+ ],
1106
+ "execution_count": 24
1107
+ },
1108
+ {
1109
+ "metadata": {
1110
+ "id": "p1OiFpXGa-Ja"
1111
+ },
1112
+ "cell_type": "markdown",
1113
+ "source": [
1114
+ "#### Subsetting tracks\n",
1115
+ "\n",
1116
+ "Subset (and reorder) to specific track names:"
1117
+ ]
1118
+ },
1119
+ {
1120
+ "metadata": {
1121
+ "executionInfo": {
1122
+ "elapsed": 63,
1123
+ "status": "ok",
1124
+ "timestamp": 1749822655084,
1125
+ "user": {
1126
+ "displayName": "",
1127
+ "userId": ""
1128
+ },
1129
+ "user_tz": -60
1130
+ },
1131
+ "id": "_l9bKsuvaxtu",
1132
+ "outputId": "abed781e-42d7-4c55-85fe-754ac8351905"
1133
+ },
1134
+ "cell_type": "code",
1135
+ "source": [
1136
+ "# Get only tracks with the name 'track1'.\n",
1137
+ "track1_tdata = tdata.select_tracks_by_name(names='track1')\n",
1138
+ "track1_tdata.values"
1139
+ ],
1140
+ "outputs": [
1141
+ {
1142
+ "data": {
1143
+ "text/plain": [
1144
+ "array([[ 0., 1.],\n",
1145
+ " [ 3., 4.],\n",
1146
+ " [ 6., 7.],\n",
1147
+ " [ 9., 10.]], dtype=float32)"
1148
+ ]
1149
+ },
1150
+ "execution_count": 25,
1151
+ "metadata": {},
1152
+ "output_type": "execute_result"
1153
+ }
1154
+ ],
1155
+ "execution_count": 25
1156
+ },
1157
+ {
1158
+ "metadata": {
1159
+ "id": "bdTrjm4pbLIC"
1160
+ },
1161
+ "cell_type": "markdown",
1162
+ "source": [
1163
+ "The metadata gets automatically filtered to `track1` too:\n"
1164
+ ]
1165
+ },
1166
+ {
1167
+ "metadata": {
1168
+ "executionInfo": {
1169
+ "elapsed": 58,
1170
+ "status": "ok",
1171
+ "timestamp": 1749822655417,
1172
+ "user": {
1173
+ "displayName": "",
1174
+ "userId": ""
1175
+ },
1176
+ "user_tz": -60
1177
+ },
1178
+ "id": "aErYz_nRbCID",
1179
+ "outputId": "59679dd0-7f08-44d9-dfb8-3f5a34e7380c"
1180
+ },
1181
+ "cell_type": "code",
1182
+ "source": [
1183
+ "track1_tdata.metadata.name.values"
1184
+ ],
1185
+ "outputs": [
1186
+ {
1187
+ "data": {
1188
+ "text/plain": [
1189
+ "array(['track1', 'track1'], dtype=object)"
1190
+ ]
1191
+ },
1192
+ "execution_count": 26,
1193
+ "metadata": {},
1194
+ "output_type": "execute_result"
1195
+ }
1196
+ ],
1197
+ "execution_count": 26
1198
+ },
1199
+ {
1200
+ "metadata": {
1201
+ "id": "lgprEnpIbPtR"
1202
+ },
1203
+ "cell_type": "markdown",
1204
+ "source": [
1205
+ "Finally, if we pass in a stranded `genome.Interval` or\n",
1206
+ "leave it unspecified (as `None`) when constructing a\n",
1207
+ "`track_data.TrackData`, we can reverse complement\n",
1208
+ "transform our track values in a strand-aware manner:\n"
1209
+ ]
1210
+ },
1211
+ {
1212
+ "metadata": {
1213
+ "executionInfo": {
1214
+ "elapsed": 58,
1215
+ "status": "ok",
1216
+ "timestamp": 1749822655727,
1217
+ "user": {
1218
+ "displayName": "",
1219
+ "userId": ""
1220
+ },
1221
+ "user_tz": -60
1222
+ },
1223
+ "id": "HeZXB4K3bMyJ",
1224
+ "outputId": "1d6250e5-f1c3-471a-b347-c73664d056a5"
1225
+ },
1226
+ "cell_type": "code",
1227
+ "source": [
1228
+ "interval = genome.Interval(\n",
1229
+ " chromosome='chr1', start=1_000, end=1_004, strand='+'\n",
1230
+ ")\n",
1231
+ "\n",
1232
+ "tdata = track_data.TrackData(\n",
1233
+ " values=values, metadata=metadata, resolution=1, interval=interval\n",
1234
+ ")\n",
1235
+ "\n",
1236
+ "tdata.reverse_complement().values"
1237
+ ],
1238
+ "outputs": [
1239
+ {
1240
+ "data": {
1241
+ "text/plain": [
1242
+ "array([[10., 9., 11.],\n",
1243
+ " [ 7., 6., 8.],\n",
1244
+ " [ 4., 3., 5.],\n",
1245
+ " [ 1., 0., 2.]], dtype=float32)"
1246
+ ]
1247
+ },
1248
+ "execution_count": 27,
1249
+ "metadata": {},
1250
+ "output_type": "execute_result"
1251
+ }
1252
+ ],
1253
+ "execution_count": 27
1254
+ },
1255
+ {
1256
+ "metadata": {
1257
+ "id": "Hb8duvxibcXW"
1258
+ },
1259
+ "cell_type": "markdown",
1260
+ "source": [
1261
+ "### Variant scoring output\n",
1262
+ "\n",
1263
+ "\u003ca href=\"https://services.google.com/fh/files/misc/anndata.png\"\u003e\u003cimg src=\"https://services.google.com/fh/files/misc/anndata.png\" alt=\"anndata\" border=\"0\" height=500\u003e\u003c/a\u003e\n",
1264
+ "\n",
1265
+ "The output of variant scoring is in `anndata.AnnData` format, which is a\n",
1266
+ "way of storing data together with annotation metadata. Originally developed in\n",
1267
+ "the single-cell RNA-seq field, `anndata.AnnData` is useful when you have\n",
1268
+ "metadata associated with an array of data.\n",
1269
+ "\n",
1270
+ "`anndata.AnnData` objects have the following properties (using\n",
1271
+ "`variant_scores` as an example `anndata.AnnData` object):\n",
1272
+ "\n",
1273
+ "* `variant_scores.X` contains a `numpy.ndarray` containing the variant\n",
1274
+ " scores for each gene in the region. This matrix has shape (`num_genes`,\n",
1275
+ " `num_tracks`), where `num_tracks` is the number of output tracks in your\n",
1276
+ " requested OutputType (such as `RNA_SEQ`, `DNASE`, etc.). Note that if you\n",
1277
+ " did not use a gene-centric scorer, then `variant_scores.X` will have shape\n",
1278
+ " (1, `num_tracks`), reflecting the fact that the variant has a single global\n",
1279
+ " score and not per-gene score.\n",
1280
+ "* `variant_scores.var` contains the track metadata as a\n",
1281
+ " `pandas.DataFrame`. For every track in the scores (`num_genes`,\n",
1282
+ " `num_tracks`), there will be a row in the track metadata explaining the\n",
1283
+ " track (its cell type, strand, etc.).\n",
1284
+ "* `variant_scores.obs` contains the gene metadata as a\n",
1285
+ " `pandas.DataFrame`. Note that the gene metadata is None in the case\n",
1286
+ " of non gene-centric variant scorers.\n",
1287
+ "* `variant_scores.uns` contains some additional unstructured metadata that\n",
1288
+ " logs the origin of the variant scores, namely:\n",
1289
+ " * The `genome.Variant` that was scored\n",
1290
+ " (variant\\_scores.uns\\[‘variant’\\])\n",
1291
+ " * The `genome.Interval` containing the interval\n",
1292
+ " (variant\\_scores.uns\\[‘interval’\\])\n",
1293
+ " * The [`variant scorer`](https://www.alphagenomedocs.com/api/models.html#variant-scorers) that was used to\n",
1294
+ " generate the scores (variant\\_scores.uns\\[‘scorer’\\])"
1295
+ ]
1296
+ },
1297
+ {
1298
+ "metadata": {
1299
+ "id": "wxew4DE0bmLO"
1300
+ },
1301
+ "cell_type": "markdown",
1302
+ "source": [
1303
+ "#### From scratch\n",
1304
+ "\n",
1305
+ "You are unlikely to need to create an `anndata.AnnData` object from\n",
1306
+ "scratch, but just for reference, here is how it would be done:"
1307
+ ]
1308
+ },
1309
+ {
1310
+ "metadata": {
1311
+ "id": "e3jeE3HVbpof"
1312
+ },
1313
+ "cell_type": "code",
1314
+ "source": [
1315
+ "import anndata\n",
1316
+ "import numpy as np\n",
1317
+ "import pandas as pd\n",
1318
+ "\n",
1319
+ "# Creating a small matrix of variant scores (3 genes x 2 tracks).\n",
1320
+ "scores = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])\n",
1321
+ "\n",
1322
+ "gene_metadata = pd.DataFrame({'gene_id': ['ENSG0001', 'ENSG0002', 'ENSG0003']})\n",
1323
+ "\n",
1324
+ "track_metadata = pd.DataFrame(\n",
1325
+ " {'name': ['track1', 'track2'], 'strand': ['+', '-']}\n",
1326
+ ")\n",
1327
+ "\n",
1328
+ "variant_scores = anndata.AnnData(\n",
1329
+ " X=scores, obs=gene_metadata, var=track_metadata\n",
1330
+ ")"
1331
+ ],
1332
+ "outputs": [],
1333
+ "execution_count": null
1334
+ },
1335
+ {
1336
+ "metadata": {
1337
+ "id": "7IHNdc_BbttW"
1338
+ },
1339
+ "cell_type": "markdown",
1340
+ "source": [
1341
+ "## Methods: making predictions\n",
1342
+ "\n",
1343
+ "The main commands for making model predictions are:\n",
1344
+ "\n",
1345
+ "* `dna_client.DnaClient.predict_sequence` to predict\n",
1346
+ " from a raw DNA string\n",
1347
+ "* `dna_client.DnaClient.predict_interval` to predict\n",
1348
+ " from a genome interval (a `genome.Interval`)\n",
1349
+ "* `dna_client.DnaClient.predict_variant` to make\n",
1350
+ " predictions for ref and alt sequences of a variant (a\n",
1351
+ " `genome.Variant` object)\n",
1352
+ "* `dna_client.DnaClient.score_variant` to score the\n",
1353
+ " effects of a variant by comparing ref and alt predictions.\n",
1354
+ "* `dna_client.DnaClient.score_variants` the same as\n",
1355
+ " the above, but for scoring a list of multiple variants.\n"
1356
+ ]
1357
+ },
1358
+ {
1359
+ "metadata": {
1360
+ "id": "lJ17YLKRQ6nq"
1361
+ },
1362
+ "cell_type": "code",
1363
+ "source": [],
1364
+ "outputs": [],
1365
+ "execution_count": null
1366
+ },
1367
+ {
1368
+ "metadata": {
1369
+ "id": "X10M3ojgbw4a"
1370
+ },
1371
+ "cell_type": "markdown",
1372
+ "source": [
1373
+ "## Methods: visualization\n",
1374
+ "\n",
1375
+ "The main command for visualizing model predictions is:\n",
1376
+ "\n",
1377
+ "* `alphagenome.visualization.plot_components.plot`, to turn a list\n",
1378
+ " of [plot components](https://www.alphagenomedocs.com/api/visualization.html#plot-components) into a\n",
1379
+ " `matplotlib.figure.Figure`.\n",
1380
+ "\n",
1381
+ "See the [visualization basics guide](https://www.alphagenomedocs.com/visualization_library_basics.html) and [visualizing predictions tutorial](https://www.alphagenomedocs.com/colabs/visualization_modality_tour.html) for more details."
1382
+ ]
1383
+ }
1384
+ ],
1385
+ "metadata": {
1386
+ "colab": {
1387
+ "last_runtime": {},
1388
+ "provenance": [
1389
+ {
1390
+ "file_id": "1hJ2uMZ3sA8pu_UvSNikENLECC-X5XrR8",
1391
+ "timestamp": 1749822158925
1392
+ }
1393
+ ]
1394
+ },
1395
+ "kernelspec": {
1396
+ "display_name": "Python 3",
1397
+ "name": "python3"
1398
+ },
1399
+ "language_info": {
1400
+ "name": "python"
1401
+ }
1402
+ },
1403
+ "nbformat": 4,
1404
+ "nbformat_minor": 0
1405
+ }
alphagenome/source/colabs/example_analysis_workflow.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
alphagenome/source/colabs/quick_start.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
alphagenome/source/colabs/tissue_ontology_mapping.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
alphagenome/source/colabs/visualization_modality_tour.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
alphagenome/source/conftest.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Configure FLAGS with default values for absltest."""
16
+
17
+ from absl import app
18
+
19
+ try:
20
+ app.run(lambda argv: None)
21
+ except SystemExit:
22
+ pass
alphagenome/source/docs/Makefile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Minimal makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line, and also
5
+ # from the environment for the first two.
6
+ SPHINXOPTS ?=
7
+ SPHINXBUILD ?= sphinx-build
8
+ SOURCEDIR = source
9
+ BUILDDIR = build
10
+
11
+ # Put it first so that "make" without argument is like "make help".
12
+ help:
13
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
+
15
+ .PHONY: help Makefile
16
+
17
+ # Catch-all target: route all unknown targets to Sphinx using the new
18
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
+ %: Makefile
20
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
alphagenome/source/docs/README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ AlphaGenome API is a service that provides comprehensive and accurate AI
2
+ predictions for genome interpretation.
3
+
4
+ AlphaGenome is a deep learning genomics model that takes a genomic (DNA)
5
+ sequence as input and predicts various molecular properties of DNA & RNA, many
6
+ at single base pair resolution.
alphagenome/source/docs/make.bat ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @rem Copyright 2024 Google LLC.
2
+ @rem
3
+ @rem Licensed under the Apache License, Version 2.0 (the "License");
4
+ @rem you may not use this file except in compliance with the License.
5
+ @rem You may obtain a copy of the License at
6
+ @rem
7
+ @rem http://www.apache.org/licenses/LICENSE-2.0
8
+ @rem
9
+ @rem Unless required by applicable law or agreed to in writing, software
10
+ @rem distributed under the License is distributed on an "AS IS" BASIS,
11
+ @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ @rem See the License for the specific language governing permissions and
13
+ @rem limitations under the License.
14
+
15
+ @ECHO OFF
16
+
17
+ pushd %~dp0
18
+
19
+ REM Command file for Sphinx documentation
20
+
21
+ if "%SPHINXBUILD%" == "" (
22
+ set SPHINXBUILD=sphinx-build
23
+ )
24
+ set SOURCEDIR=source
25
+ set BUILDDIR=build
26
+
27
+ %SPHINXBUILD% >NUL 2>NUL
28
+ if errorlevel 9009 (
29
+ echo.
30
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
31
+ echo.installed, then set the SPHINXBUILD environment variable to point
32
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
33
+ echo.may add the Sphinx directory to PATH.
34
+ echo.
35
+ echo.If you don't have Sphinx installed, grab it from
36
+ echo.https://www.sphinx-doc.org/
37
+ exit /b 1
38
+ )
39
+
40
+ if "%1" == "" goto help
41
+
42
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
43
+ goto end
44
+
45
+ :help
46
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
47
+
48
+ :end
49
+ popd
alphagenome/source/docs/source/_templates/autosummary/class.rst ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ fullname | escape | underline }}
2
+
3
+ .. currentmodule:: {{ module }}
4
+
5
+ .. autoclass:: {{ objname }}
6
+
7
+ {% block attributes %}
8
+ {% if attributes %}
9
+
10
+
11
+ Attributes
12
+ ~~~~~~~~~~
13
+
14
+ .. rubric:: Table
15
+
16
+ .. autosummary::
17
+ {% for item in attributes %}
18
+ {%- if item not in inherited_members%}
19
+ ~{{ name }}.{{ item }}
20
+ {%- endif -%}
21
+ {%- endfor %}
22
+
23
+ {% for item in attributes %}
24
+ .. autoattribute:: {{ [objname, item] | join(".") }}
25
+ {%- endfor %}
26
+
27
+ {% endif %}
28
+ {% endblock %}
29
+
30
+ {% block methods %}
31
+
32
+ {% if methods %}
33
+
34
+ Methods
35
+ ~~~~~~~
36
+
37
+ .. rubric:: Table
38
+
39
+ .. autosummary::
40
+ {% for item in methods %}
41
+ {%- if item != '__init__' %}
42
+ ~{{ name }}.{{ item }}
43
+ {%- endif -%}
44
+
45
+ {%- endfor %}
46
+
47
+ {% for item in methods %}
48
+ {%- if item != '__init__'%}
49
+ .. automethod:: {{ [objname, item] | join(".") }}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {% endif %}
53
+
54
+ {% endblock %}
55
+
alphagenome/source/docs/source/api/data.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data
2
+
3
+ Classes and utilities for manipulating genomics data.
4
+
5
+ ## Fold Intervals
6
+
7
+ ``` {eval-rst}
8
+ .. currentmodule:: alphagenome
9
+ ```
10
+
11
+ ``` {eval-rst}
12
+
13
+ .. autosummary::
14
+ :toctree: generated
15
+
16
+ data.fold_intervals.Subset
17
+ data.fold_intervals.get_all_folds
18
+ data.fold_intervals.get_fold_names
19
+ data.fold_intervals.get_fold_intervals
20
+ ```
21
+
22
+ ## Genome
23
+
24
+ ``` {eval-rst}
25
+ .. currentmodule:: alphagenome
26
+ ```
27
+
28
+ ``` {eval-rst}
29
+
30
+ .. autosummary::
31
+ :toctree: generated
32
+
33
+ data.genome.Strand
34
+ data.genome.Interval
35
+ data.genome.Variant
36
+ data.genome.Junction
37
+ ```
38
+
39
+ ## Gene annotation
40
+
41
+ ``` {eval-rst}
42
+
43
+ .. autosummary::
44
+ :toctree: generated
45
+
46
+ data.gene_annotation.TranscriptType
47
+ data.gene_annotation.extract_tss
48
+ data.gene_annotation.filter_transcript_type
49
+ data.gene_annotation.filter_protein_coding
50
+ data.gene_annotation.filter_to_longest_transcript
51
+ data.gene_annotation.filter_transcript_support_level
52
+ ```
53
+
54
+ ## Ontology
55
+
56
+ ``` {eval-rst}
57
+
58
+ .. autosummary::
59
+ :toctree: generated
60
+
61
+ data.ontology.OntologyType
62
+ data.ontology.OntologyTerm
63
+ ```
64
+
65
+ ## Track data
66
+
67
+ ``` {eval-rst}
68
+
69
+ .. autosummary::
70
+ :toctree: generated
71
+
72
+ data.track_data.TrackData
73
+ data.track_data.concat
74
+ data.track_data.interleave
75
+ data.track_data.metadata_to_proto
76
+ data.track_data.metadata_from_proto
77
+ data.track_data.from_protos
78
+ ```
79
+
80
+ ## Transcript
81
+
82
+ ``` {eval-rst}
83
+
84
+ .. autosummary::
85
+ :toctree: generated
86
+
87
+ data.transcript.Transcript
88
+ data.transcript.TranscriptExtractor
89
+ ```
alphagenome/source/docs/source/api/index.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API
2
+
3
+ ``` {toctree}
4
+ :maxdepth: 1
5
+ :hidden:
6
+
7
+ data
8
+ models
9
+ interpretation
10
+ visualization
11
+ ```
12
+
13
+ <!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
14
+
15
+ ::::{grid} 1 1 2 3
16
+ :gutter: 2
17
+
18
+ :::{grid-item-card} Data
19
+ :link: data
20
+ :link-type: doc
21
+
22
+ Classes and utilities for manipulating genomics data.
23
+ :::
24
+
25
+ :::{grid-item-card} Models
26
+ :link: models
27
+ :link-type: doc
28
+
29
+ AlphaGenome client and variant scorers.
30
+ :::
31
+
32
+ :::{grid-item-card} Interpretation
33
+ :link: interpretation
34
+ :link-type: doc
35
+
36
+ Sequence interpretation tools (like in silico mutagenesis).
37
+ :::
38
+
39
+ :::{grid-item-card} Visualization
40
+ :link: visualization
41
+ :link-type: doc
42
+
43
+ Visualization and plotting tools.
44
+ :::
45
+
46
+ <!-- mdformat on -->
alphagenome/source/docs/source/api/interpretation.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Interpretation
2
+
3
+ Sequence interpretation tools (like in silico mutagenesis).
4
+
5
+ ## ISM
6
+
7
+ ``` {eval-rst}
8
+ .. module:: alphagenome.interpretation
9
+ .. currentmodule:: alphagenome
10
+
11
+ .. autosummary::
12
+ :toctree: generated
13
+
14
+ interpretation.ism.ism_variants
15
+ interpretation.ism.ism_matrix
16
+ ```
alphagenome/source/docs/source/api/models.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Models
2
+
3
+ AlphaGenome client and variant scorers.
4
+
5
+ ## DNA Client
6
+
7
+ ``` {eval-rst}
8
+ .. currentmodule:: alphagenome
9
+ ```
10
+
11
+ ``` {eval-rst}
12
+
13
+ .. autosummary::
14
+ :toctree: generated
15
+
16
+ models.dna_client.create
17
+ models.dna_client.ModelVersion
18
+ models.dna_client.Organism
19
+ models.dna_client.validate_sequence_length
20
+ models.dna_client.DnaClient
21
+ ```
22
+
23
+ ## DNA Output
24
+
25
+ ``` {eval-rst}
26
+
27
+ .. autosummary::
28
+ :toctree: generated
29
+
30
+ models.dna_output.OutputType
31
+ models.dna_output.Output
32
+ models.dna_output.OutputMetadata
33
+ models.dna_output.VariantOutput
34
+ ```
35
+
36
+ ## Variant Scorers
37
+
38
+ ``` {eval-rst}
39
+
40
+ .. autosummary::
41
+ :toctree: generated
42
+
43
+ models.variant_scorers.AggregationType
44
+ models.variant_scorers.BaseVariantScorer
45
+ models.variant_scorers.CenterMaskScorer
46
+ models.variant_scorers.ContactMapScorer
47
+ models.variant_scorers.GeneMaskLFCScorer
48
+ models.variant_scorers.GeneMaskActiveScorer
49
+ models.variant_scorers.GeneMaskSplicingScorer
50
+ models.variant_scorers.PolyadenylationScorer
51
+ models.variant_scorers.SpliceJunctionScorer
52
+ models.variant_scorers.get_recommended_scorers
53
+ models.variant_scorers.tidy_anndata
54
+ models.variant_scorers.tidy_scores
55
+ ```
alphagenome/source/docs/source/api/visualization.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Visualization
2
+
3
+ Visualization and plotting tools.
4
+
5
+ ## Plot
6
+
7
+ ``` {eval-rst}
8
+ .. currentmodule:: alphagenome
9
+ ```
10
+
11
+ ``` {eval-rst}
12
+
13
+ .. autosummary::
14
+ :toctree: generated
15
+
16
+ visualization.plot.seqlogo
17
+ visualization.plot.plot_contact_map
18
+ visualization.plot.plot_track
19
+ visualization.plot.plot_tracks
20
+ visualization.plot.sashimi_plot
21
+ visualization.plot.pad_track
22
+ ```
23
+
24
+ (visualization/plot-components)=
25
+
26
+ ## Plot components
27
+
28
+ ``` {eval-rst}
29
+
30
+ .. autosummary::
31
+ :toctree: generated
32
+
33
+ visualization.plot_components.plot
34
+ visualization.plot_components.AbstractComponent
35
+ visualization.plot_components.Tracks
36
+ visualization.plot_components.OverlaidTracks
37
+ visualization.plot_components.ContactMaps
38
+ visualization.plot_components.ContactMapsDiff
39
+ visualization.plot_components.TranscriptAnnotation
40
+ visualization.plot_components.SeqLogo
41
+ visualization.plot_components.Sashimi
42
+ visualization.plot_components.EmptyComponent
43
+ visualization.plot_components.AbstractAnnotation
44
+ visualization.plot_components.IntervalAnnotation
45
+ visualization.plot_components.VariantAnnotation
46
+ ```
47
+
48
+ ## Plot transcripts
49
+
50
+ ``` {eval-rst}
51
+
52
+ .. autosummary::
53
+ :toctree: generated
54
+
55
+ visualization.plot_transcripts.TranscriptStyle
56
+ visualization.plot_transcripts.TranscriptStylePreset
57
+ visualization.plot_transcripts.plot_transcripts
58
+ visualization.plot_transcripts.draw_interval
59
+ visualization.plot_transcripts.draw_transcript
60
+ ```
alphagenome/source/docs/source/conf.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Configuration file for the Sphinx documentation builder."""
16
+
17
+ #
18
+ # This file only contains a selection of the most common options. For a full
19
+ # list see the documentation:
20
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html
21
+
22
+ # -- Path setup --------------------------------------------------------------
23
+
24
+ # If extensions (or modules to document with autodoc) are in another directory,
25
+ # add these directories to sys.path here. If the directory is relative to the
26
+ # documentation root, use os.path.abspath to make it absolute, like shown here.
27
+ #
28
+ import importlib.metadata
29
+ import inspect
30
+ import os
31
+ import sys
32
+
33
+ # The package is installed by Readthedocs before sphinx building.
34
+ import alphagenome # pylint: disable=unused-import, g-import-not-at-top
35
+ import alphagenome.models.dna_client # pylint: disable=unused-import, g-import-not-at-top
36
+
37
+ # -- Project information -----------------------------------------------------
38
+
39
+ project = 'alphagenome'
40
+ project_info = importlib.metadata.metadata(project)
41
+ author = project_info['Author']
42
+ copyright = f'2024, {author}' # pylint: disable=redefined-builtin
43
+ version = project_info['Version']
44
+ repository_url = f'https://github.com/google-deepmind/{project}'
45
+
46
+
47
+ # The full version, including alpha/beta/rc tags
48
+ release = version
49
+ # Warn if links are broken
50
+ nitpicky = True
51
+
52
+ # -- General configuration ---------------------------------------------------
53
+
54
+ # Add any Sphinx extension module names here, as strings. They can be
55
+ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
56
+ # ones.
57
+ extensions = [
58
+ 'myst_nb',
59
+ 'sphinx_design',
60
+ 'sphinx.ext.autodoc',
61
+ 'sphinx.ext.intersphinx',
62
+ 'sphinx.ext.autosummary',
63
+ 'sphinx.ext.napoleon',
64
+ 'sphinxcontrib.bibtex',
65
+ 'sphinx_autodoc_typehints',
66
+ 'sphinx.ext.mathjax',
67
+ 'IPython.sphinxext.ipython_console_highlighting',
68
+ 'sphinx.ext.coverage',
69
+ 'sphinx_copybutton',
70
+ 'sphinx_remove_toctrees',
71
+ 'sphinx.ext.linkcode',
72
+ ]
73
+
74
+ autosummary_generate = True
75
+ autodoc_member_order = 'groupwise'
76
+ default_role = 'literal'
77
+ bibtex_reference_style = 'author_year'
78
+ napoleon_google_docstring = True
79
+ napoleon_numpy_docstring = False
80
+ napoleon_include_init_with_doc = False
81
+ napoleon_use_rtype = True
82
+ napoleon_use_param = True
83
+ myst_heading_anchors = 6 # Create heading anchors for h1-h6
84
+ autodoc_mock_imports = [
85
+ 'google.protobuf.runtime_version',
86
+ 'google.protobuf.internal.builder',
87
+ 'absl',
88
+ 'alphagenome.protos',
89
+ ]
90
+ remove_from_toctrees = ['api/generated/*']
91
+ bibtex_bibfiles = ['refs.bib']
92
+
93
+ myst_enable_extensions = [
94
+ 'amsmath',
95
+ 'colon_fence',
96
+ 'deflist',
97
+ 'dollarmath',
98
+ 'html_image',
99
+ 'html_admonition',
100
+ 'attrs_inline',
101
+ 'attrs_block',
102
+ ]
103
+
104
+ # TODO(b/372225132): Resolve showing notebook output without executing.
105
+ # TODO(b/372226231): Resolve not modifying notebook when building docs.
106
+ myst_url_schemes = ['http', 'https', 'mailto']
107
+ nb_output_stderr = 'remove'
108
+ nb_execution_mode = 'off'
109
+ nb_merge_streams = True
110
+ typehints_defaults = 'braces'
111
+
112
+ source_suffix = {
113
+ '.rst': 'restructuredtext',
114
+ '.ipynb': 'myst-nb',
115
+ '.myst': 'myst-nb',
116
+ }
117
+
118
+ intersphinx_mapping = {
119
+ 'python': ('https://docs.python.org/3', None),
120
+ 'anndata': ('https://anndata.readthedocs.io/en/stable/', None),
121
+ 'numpy': ('https://numpy.org/doc/stable/', None),
122
+ 'jax': ('https://jax.readthedocs.io/en/latest/', None),
123
+ 'pandas': ('https://pandas.pydata.org/docs/', None),
124
+ 'matplotlib': ('https://matplotlib.org/stable/', None),
125
+ }
126
+
127
+ # Add any paths that contain templates here, relative to this directory.
128
+ templates_path = ['_templates']
129
+
130
+ # List of patterns, relative to source directory, that match files and
131
+ # directories to ignore when looking for source files.
132
+ # This pattern also affects html_static_path and html_extra_path.
133
+ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'protos']
134
+
135
+ # -- Options for autodoc -----------------------------------------------------
136
+
137
+ autodoc_default_options = {
138
+ 'member-order': 'bysource',
139
+ 'special-members': True,
140
+ 'exclude-members': '__repr__, __str__, __weakref__',
141
+ }
142
+
143
+
144
+ # -- Source code links -----------------------------------------------------
145
+
146
+
147
def linkcode_resolve(domain, info):
  """Resolve the GitHub URL of the source code for a documented Python object.

  This is the hook looked up by the `sphinx.ext.linkcode` extension enabled in
  `extensions` above.

  Args:
    domain: Sphinx domain of the object. Only 'py' is handled.
    info: Mapping holding the 'module' and 'fullname' of the object.

  Returns:
    A URL pointing at the object's line range in the repository, or None when
    the object's source cannot be located (in which case no link is emitted).
  """
  if domain != 'py':
    return None

  # Only modules that are already imported can be resolved. A missing entry in
  # sys.modules raises KeyError (never ImportError), so look it up with .get().
  mod = sys.modules.get(info['module'])
  if mod is None:
    return None

  # Walk the dotted name (e.g. 'Class.method') down from the module.
  obj = mod
  try:
    for attr in info['fullname'].split('.'):
      obj = getattr(obj, attr)
  except AttributeError:
    return None
  # Look through decorators so the link targets the wrapped implementation.
  obj = inspect.unwrap(obj)

  try:
    filename = inspect.getsourcefile(obj)
    source, lineno = inspect.getsourcelines(obj)
  except (TypeError, OSError):
    # TypeError: builtins and plain values have no source file.
    # OSError: the source text could not be retrieved.
    return None
  if filename is None:
    # getsourcefile() returns None when no Python source file can be
    # determined; relpath() would raise on it.
    return None

  path = os.path.relpath(filename, start=os.path.dirname(alphagenome.__file__))
  # GitHub line-range anchors have the form '#L<start>-L<end>'.
  return (
      f'{repository_url}/tree/main/src/{project}/'
      f'{path}#L{lineno}-L{lineno + len(source) - 1}'
  )
181
+
182
+
183
+ # -- Options for HTML output -------------------------------------------------
184
+
185
+ # The theme to use for HTML and HTML Help pages. See the documentation for
186
+ # a list of builtin themes.
187
+
188
+ html_theme = 'sphinx_book_theme'
189
+ html_title = 'AlphaGenome'
190
+ pygments_style = 'default'
191
+ html_theme_options = {
192
+ 'repository_url': repository_url,
193
+ 'repository_branch': 'main',
194
+ 'use_repository_button': True,
195
+ 'launch_buttons': {
196
+ 'colab_url': 'https://colab.research.google.com',
197
+ },
198
+ 'article_header_start': ['toggle-primary-sidebar.html', 'breadcrumbs'],
199
+ 'show_prev_next': False,
200
+ }
201
+ # Add any paths that contain custom static files (such as style sheets) here,
202
+ # relative to this directory. They are copied after the builtin static files,
203
+ # so a file named "default.css" will overwrite the builtin "default.css".
204
+ # html_static_path = ['_static']
205
+
206
+ # TODO: b/377291190 - Look at adding notebook support (see haiku example)
alphagenome/source/docs/source/exploring_model_metadata.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model output metadata
2
+
3
+ AlphaGenome returns predictions for 11 different output types, covering a
4
+ variety of modalities. Here we provide details about the human model outputs and
5
+ associated metadata to help users make informed decisions about the parameters
6
+ of their API requests (e.g., ontology term; output types).
7
+
8
+ For further details on dataset processing and precise definitions of each output
9
+ type, including their respective units and normalization methods, please refer
10
+ to the Methods section of the AlphaGenome paper.
11
+
12
+ <!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
13
+ <!-- mdlint off(LINK_ID) -->
14
+
15
+ {#model_metadata-table1-target}
16
+ **Table 1**: Descriptions of output types predicted by AlphaGenome.
17
+
18
+ | OutputType name | Description | Units | Resolution | Unique biosamples | Total tracks |
19
+ | :--- | :--- | :--- | :--- | :--- | :--- |
20
+ | [RNA\_SEQ](#alphagenome.models.dna_output.OutputType.RNA_SEQ) | RNA expression as measured by RNA-seq. Includes a mixture of PolyA+ RNA and Total RNA assays. Some tracks are also stranded. | Normalized read signal | 1bp | 285 | 667 |
21
+ | [CAGE](#alphagenome.models.dna_output.OutputType.CAGE) | RNA expression at transcription start-sites as measured by Cap Analysis Gene Expression (CAGE) assay. | Normalized read signal | 1bp | 264 | 546 |
22
+ | [PROCAP](#alphagenome.models.dna_output.OutputType.PROCAP) | RNA expression at transcription start-sites as measured by Precision Run-On sequencing and capping (PROCAP) assay. | Normalized read signal | 1bp | 6 | 12 |
23
+ | [DNASE](#alphagenome.models.dna_output.OutputType.DNASE) | Chromatin accessibility as measured by DNase I hypersensitive sites sequencing (DNase-seq) assay. | Normalized insertion signal | 1bp | 305 | 305 |
24
+ | [ATAC](#alphagenome.models.dna_output.OutputType.ATAC) | Chromatin accessibility as measured by the transposase-accessible chromatin (ATAC-seq) assay. | Normalized insertion signal | 1bp | 167 | 167 |
25
+ | [CHIP\_HISTONE](#alphagenome.models.dna_output.OutputType.CHIP_HISTONE) | Relative abundance of histone modification marks as measured by chromatin immunoprecipitation (ChIP-seq) for 24 different markers, e.g., H3K27ac (see ENCODE [documentation](https://www.encodeproject.org/chip-seq/histone/)). | Fold-change over control | 128bp | 219 | 1116 |
26
+ | [CHIP\_TF](#alphagenome.models.dna_output.OutputType.CHIP_TF) | Relative abundance of DNA-bound transcription factors as measured by ChIP-seq targeting 43 different proteins (see ENCODE [documentation](https://www.encodeproject.org/chip-seq/transcription_factor/)). | Fold-change over control | 128bp | 163 | 1617 |
27
+ | [SPLICE\_SITES](#alphagenome.models.dna_output.OutputType.SPLICE_SITES) | Predicted location of donor or acceptor splice sites, for both the positive and negative strand, expressed as a probability (higher numbers indicate higher probability of the base being a splice site). | Predicted probability | 1bp | NA | 4 |
28
+ | [SPLICE\_JUNCTIONS](#alphagenome.models.dna_output.OutputType.SPLICE_JUNCTIONS) | Splice junction spliced read counts, as measured by RNA-Seq. Predictions are for all possible pairings of at most 512 donors and 512 acceptors from each strand in the requested interval, where the position of donors and acceptors along the input sequence is given by predictions of splice site positions. | Normalized junction signal | 1bp | 282 | 734 |
29
+ | [SPLICE\_SITE\_USAGE](#alphagenome.models.dna_output.OutputType.SPLICE_SITE_USAGE) | Fraction of transcripts using a splice site, as measured by RNA-seq. All reads that span a given splice site are considered, and we predict the fraction of these that use the site (donor or acceptor). | Fraction | 1bp | 282 | 734 |
30
+ | [CONTACT\_MAPS](#alphagenome.models.dna_output.OutputType.CONTACT_MAPS) | Relative frequency of physical contact between pairwise positions (symmetric), derived from chromatin contact maps (Micro-C and Hi-C assays). Values are coarse-grained and normalized by removing the off-diagonal power law decay (as also done in [Zhou, J. 2022](https://www.nature.com/articles/s41588-022-01065-4)). | Log-fold over genomic distance-based expectation | 2048bp | 12 | 28 |
31
+ <!-- mdlint on -->
32
+ <!-- mdformat on -->
33
+
34
+ ## Track metadata
35
+
36
+ To access the metadata describing each track for human outputs use:
37
+
38
+ ```py
39
+ output_metadata = dna_model.output_metadata(
40
+ organism=dna_client.Organism.HOMO_SAPIENS
41
+ )
42
+ ```
43
+
44
+ Each predicted output type (e.g., RNA\_SEQ) contains metadata in a
45
+ {py:class}`~pandas.DataFrame`: {py:class}`output_metadata.rna_seq
46
+ <alphagenome.models.dna_output.OutputMetadata.rna_seq>`
47
+
48
+ Each row of the {class}`~pandas.DataFrame` corresponds to a ‘track’, and each
49
+ column contains key information for biological interpretation such as:
50
+
51
+ * `name`: Name of the track. Example: `CL:0000047 polyA plus RNA-seq`.
52
+ * `strand`: Strand of the track, either positive (`+`), negative (`-`), or
53
+ unstranded (`.`).
54
+ * `ontology_curie`: A string ID representing the ontology term corresponding
55
+ to the biosample. Example: `CL:0000100`.
56
+ * `biosample_name`: Plain text description of the biosample. Example: `motor
57
+ neuron`.
58
+
59
+ For a full list of metadata columns available for each output type, please see
60
+ the [navigating data ontologies notebook](colabs/tissue_ontology_mapping), which
61
+ demonstrates how to access and browse track metadata.
62
+
63
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
64
+
65
+ :::{note} For `SPLICE_JUNCTIONS` outputs the strand information is a property of
66
+ a junction rather than a track, so the metadata for this output type will show
67
+ half as many rows as reported in the above table.
68
+ :::
69
+
70
+ <!-- mdformat on -->
71
+
72
+ ## Additional track metadata
73
+
74
+ Some output types contain additional columns. For example,
75
+ {py:class}`OutputMetadata.rna_seq
76
+ <alphagenome.models.dna_output.OutputMetadata.rna_seq>` and
77
+ {py:class}`OutputMetadata.splice_sites
78
+ <alphagenome.models.dna_output.OutputMetadata.splice_sites>` also contain a
79
+ `gtex_tissue` column, which is populated for the tracks that make predictions
80
+ for the tissues sampled in the
81
+ [GTEx project](https://gtexportal.org/home/samplingSitePage)
82
+ {cite:t}`gtex2020gtex`.
83
+
84
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
85
+ :::{note} For one tissue,
86
+ ‘Brain \- Cerebellar hemisphere’, we used an alternative Uberon ID to the one that was
87
+ provided in the
88
+ [GTEx documentation](https://gtexportal.org/home/samplingSitePage)
89
+ (‘UBERON:0002037’), to reflect Uberon’s ID for cerebellar hemisphere:
90
+ [‘UBERON:0002245’](https://www.ebi.ac.uk/ols4/ontologies/uberon/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FUBERON_0002245).
91
+ :::
92
+
93
+ <!-- mdformat on -->
alphagenome/source/docs/source/faqs.md ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FAQ
2
+
3
+ Frequently asked questions.
4
+
5
+ ## Model inputs
6
+
7
+ ### How do I make predictions for a specific genomic region?
8
+
9
+ You can define any region in either the human or mouse genome, and use the API
10
+ to predict various outputs. See the [quick start colab](colabs/quick_start) for
11
+ a demonstration.
12
+
13
+ ### How do I specify a genomic region?
14
+
15
+ Using the {class}`genome.Interval<alphagenome.data.genome.Interval>` class,
16
+ which is initialized with a chromosome, a start, and an end position.
17
+
18
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
19
+
20
+ :::{note}
21
+ AlphaGenome classes such as {class}`genome.Interval<alphagenome.data.genome.Interval>`
22
+ use 0-based indexing, consistent with the underlying Python implementations.
23
+
24
+ This means an
25
+ {class}`genome.Interval<alphagenome.data.genome.Interval>` includes the base
26
+ pair at the `start` position up to the base pair at the `end-1` position.
27
+
28
+ For example, to specify the first base pair of chromosome 1, use
29
+ `genome.Interval('chr1', 0, 1)`. This interval has a width of 1, and contains
30
+ only the base pair at the first position of chromosome 1.
31
+
32
+ To interpret interval overlaps, remember that 0-based indexing excludes the base
33
+ pair at the `end` position itself, such that
34
+ `genome.Interval('chr1', 0, 1).overlaps(genome.Interval('chr1', 1, 2))`
35
+ returns `False`.
36
+ :::
37
+
38
+ <!-- mdformat on -->
39
+
40
+ ### What are the reference genome versions used by the model?
41
+
42
+ We use human genome assembly hg38 (GRCh38.p13.genome.fa) and mouse assembly mm10
43
+ (GRCm38.p6.genome.fa). For other genome builds (such as hg19, for example), the
44
+ [LiftOver](https://genome.ucsc.edu/cgi-bin/hgLiftOver) tool can be used to
45
+ convert from hg38 coordinates to the desired assembly.
46
+
47
+ ### Can I make a prediction for any arbitrary DNA sequence?
48
+
49
+ Yes, you can make predictions for any sequence, provided it is within the range
50
+ of sequence lengths supported by the model. Note that model predictions have
51
+ only been evaluated using sequences that vary by a relatively small amount from
52
+ the reference genome (SNPs and indels), so very large differences from the human
53
+ reference genome (for example, structural variants, sequences with a large
54
+ amount of padding, synthetic sequences, or artificial DNA constructs) may result
55
+ in predictions that are not as reliable.
56
+
57
+ ### Can I make predictions for DNA from other species?
58
+
59
+ Yes, with the caveat that the model has only been trained on mouse and human
60
+ DNA. Prediction quality is likely to degrade as evolutionary distance from these
61
+ two species increases, but note that this has not been formally benchmarked.
62
+
63
+ ### What is the longest sequence the model can take as input?
64
+
65
+ 1MB (precisely 2^20 base-pairs long). Other sequence lengths are also supported:
66
+ \~2KB, \~16KB, \~100KB, \~500KB.
67
+
68
+ ### How do I request predictions for a sequence with a length that is not in the list of supported lengths?
69
+
70
+ You can use
71
+ {func}`genome.Interval.resize<alphagenome.data.genome.Interval.resize>` to crop
72
+ or expand your sequence length to the nearest supported length.
73
+
74
+ Note that `.resize` expands sequences using the actual surrounding genomic data,
75
+ not by adding padding.
76
+
77
+ ## Model outputs
78
+
79
+ ### How many tracks are there per output type and what do they represent?
80
+
81
+ This varies from 5 to over 600. Each of the tracks refers to a particular
82
+ cell-type or tissue, as well as other properties, such as strand or a specific
83
+ transcription factor (for the `CHIP_TF` output type). See the
84
+ [output metadata documentation](project:exploring_model_metadata.md#model-output-metadata)
85
+ for a full list of the output types.
86
+
87
+ ### How do I find out what tissue or cell-type an output ‘track’ refers to?
88
+
89
+ Using the [navigating data ontologies notebook](colabs/tissue_ontology_mapping),
90
+ you can look at the output metadata where biosample names and ontology CURIEs
91
+ (IDs) for each track are described.
92
+
93
+ ### What is an ontology CURIE?
94
+
95
+ CURIEs (Compact Uniform Resource Identifiers) are standardized, abbreviated
96
+ codes (e.g., ‘UBERON:0001114’ for liver) that uniquely identify specific
97
+ ontology terms.
98
+
99
+ ### Where are your ontology CURIEs sourced from?
100
+
101
+ We source these from the IDs provided in the source training data. We also
102
+ restricted the ontology types to UBERON, CL, CLO and EFO, following ENCODE
103
+ practices. We recommend using EBI's
104
+ [Ontology Lookup Service](https://www.ebi.ac.uk/ols4) to understand
105
+ relationships between the ontology IDs for different tracks.
106
+
107
+ ### What is strandedness?
108
+
109
+ DNA is double-stranded, meaning that there are two nucleotide strands that form
110
+ the double helix. By convention, one of those molecules is designated the
111
+ forward, or positive strand (5'->3'), and the other is designated the reverse,
112
+ or negative strand (3'->5').
113
+
114
+ Genomic assays can either be unstranded or stranded (also called
115
+ strand-specific).
116
+
117
+ * Unstranded assays return results that do not distinguish whether a
118
+ measurement came from the positive or negative strand. Certain assays do not
119
+ generate stranded information – for example, ATAC-seq generates unstranded
120
+ accessibility information.
121
+ * Stranded (or strand-specific) assays annotate each measurement as coming
122
+ from the positive or negative strand. This is important for transcriptional
123
+ assays to distinguish between strand-specific transcripts (for example, two
124
+ transcripts that share a transcriptional start site but are on different
125
+ strands).
126
+
127
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
128
+
129
+ :::{note}
130
+ Not all RNA-seq samples will be stranded, especially those that are
131
+ from older experiments. For example, GTEx RNA-seq data is unstranded.
132
+ :::
133
+
134
+ <!-- mdformat on -->
135
+
136
+ For more general information about the difference between non-stranded and
137
+ stranded protocols and how to interpret them, there is a helpful tutorial
138
+ [here](https://www.ecseq.com/support/ngs/how-do-strand-specific-sequencing-protocols-work).
139
+
140
+ ### How is strandedness handled in model outputs?
141
+
142
+ In the model output metadata, we use the following symbols to designate the
143
+ strand of a track:
144
+
145
+ * positive: `+`
146
+ * negative: `-`
147
+ * unstranded: `.`
148
+
149
+ For assays that were performed in a stranded (or strand-specific) manner, the
150
+ assay will have two tracks per cell or tissue type: one for the positive (`+`)
151
+ and another for the negative (`-`) strand.
152
+
153
+ For unstranded assays, there will be a single track per cell or tissue type,
154
+ annotated as unstranded (`.`).
155
+
156
+ We provide convenience operations for manipulating
157
+ {class}`~alphagenome.data.track_data.TrackData` based on strand information,
158
+ such as
159
+ {func}`~alphagenome.data.track_data.TrackData.filter_to_negative_strand`, etc.
160
+
161
+ ### How can I save the model outputs?
162
+
163
+ For *variant effect predictions*: We recommend converting the scores into a
164
+ pandas DataFrame. This DataFrame can then be easily exported to a common file
165
+ format, such as a CSV file, for use with other tools or for record-keeping.
166
+ Specific instructions and examples for this process are provided in our 'Variant
167
+ Scoring UI' tutorial.
168
+
169
+ For *genome track predictions (e.g., RNA-seq levels)*: The predicted track data
170
+ is provided as NumPy arrays within TrackData objects. These arrays can be
171
+ directly saved to disk using standard NumPy functions, such as `numpy.save` (for
172
+ saving a single array to a `.npy` file) or `numpy.savez_compressed` (for saving
173
+ multiple arrays into a single compressed `.npz` file).
174
+
175
+ ### What are some of the limitations of the model?
176
+
177
+ AlphaGenome has several key limitations:
178
+
179
+ - *Tissue-specificity and long-range interactions*: While AlphaGenome shows
180
+ improvements in these areas compared to previous models, accurately
181
+ capturing tissue-specific effects and long-range genomic interactions
182
+ remains challenging for deep learning models in genomics, requiring further
183
+ research.
184
+ - *Species scope*: The model is trained and evaluated on human and mouse DNA.
185
+ Its performance on DNA from other species has not been determined.
186
+ - *Personal genomes*: The model has not yet been benchmarked for predicting
187
+ individual (personal) human genomes.
188
+ - *Molecular scope*: AlphaGenome predicts the molecular consequences of
189
+ genetic variations. Its direct applicability to complex trait analysis is
190
+ limited, as these traits also involve broader biological processes (e.g.,
191
+ gene function, development, environmental factors) beyond the model's
192
+ primary focus.
193
+ - *Unphased training and single sequence input*: The model processes a single
194
+ DNA sequence at a time and is therefore not inherently 'diploid-aware'. It
195
+ was trained using unphased data, meaning it could not learn to distinguish
196
+ between alleles inherited from the mother versus the father. Consequently,
197
+ its variant effect predictions do not inherently model heterozygous states
198
+ (i.e., the presence of both a reference and a variant allele at a site
199
+ simultaneously).
200
+
201
+ ## Visualizing predictions
202
+
203
+ ### How do I visualize the predicted output?
204
+
205
+ You can use any tool to visualize the numerical output, but we provide a Python
206
+ [visualization library](project:api/visualization.md#Visualization) so you can
207
+ easily visualize the output immediately. You can use our
208
+ [visualization basics guide](project:visualization_library_basics.md) and see
209
+ examples of how to plot different modalities in our
210
+ [visualizing predictions tutorial](colabs/visualization_modality_tour).
211
+
212
+ ### Can I design my own visualizations to work with this library?
213
+
214
+ Yes. The returned figures are based on matplotlib, so they should be extensible.
215
+ Additionally, you can choose to work with the raw output data and design your
216
+ own visualizations.
217
+
218
+ ### Where are the plotted transcript annotations from?
219
+
220
+ Transcript annotations are sourced from standard Gene Transfer Format (GTF)
221
+ files from GENCODE: the hg38 reference assembly (release 46) for human and the
222
+ mm10 reference assembly (release M23) for mouse.
223
+
224
+ ### Am I limited to only plotting protein-coding genes, and only the longest transcript?
225
+
226
+ No. If you wish to include other gene types or all transcripts (not just the
227
+ longest), you can remove the respective calls to
228
+ `gene_annotation.filter_protein_coding(gtf)` and
229
+ `gene_annotation.filter_to_longest_transcript(gtf)` in your code. Note that
230
+ including more transcripts can make the plot appear busy; you can adjust the
231
+ `fig_height` parameter of the `TranscriptAnnotation` plot component to improve
232
+ legibility.
233
+
234
+ ## Variant scoring
235
+
236
+ ### How do I define a variant?
237
+
238
+ By creating a {class}`~alphagenome.data.genome.Variant` object.
239
+
240
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
241
+
242
+ :::{note}
243
+ :name: variant-position-is-1-based
244
+ As mentioned above, AlphaGenome classes such as
245
+ {class}`~alphagenome.data.genome.Variant` use 0-indexing, and Variant's
246
+ {func}`~alphagenome.data.genome.Variant.start` and
247
+ {func}`~alphagenome.data.genome.Variant.end` contain 0-indexed values.
248
+
249
+ However, most variants in public databases, such as dbSNP, are provided as
250
+ 1-indexed.
251
+
252
+ To enable compatibility with these annotations, the
253
+ {class}`~alphagenome.data.genome.Variant` object is initialized with a
254
+ 1-indexed {attr}`~alphagenome.data.genome.Variant.position` attribute, which is
255
+ then converted to 0-indexing internally (i.e.,
256
+ {func}`~alphagenome.data.genome.Variant.start` returns
257
+ {attr}`~alphagenome.data.genome.Variant.position` - 1).
258
+
259
+ See the {class}`~alphagenome.data.genome.Variant` docstring for more details.
260
+ :::
261
+
262
+ <!-- mdformat on -->
263
+
264
+ ### Are there tools to help me define variants, and run inference for them?
265
+
266
+ See the
267
+ [scoring and visualizing a single variant notebook](colabs/variant_scoring_ui)
268
+ which walks through how to define a {class}`~alphagenome.data.genome.Variant`
269
+ object and perform inference. Batch inference over many variants can be
270
+ performed using the
271
+ [batch variant scoring notebook](colabs/batch_variant_scoring) which takes a
272
+ variant call file (VCF) as input.
273
+
274
+ ### Can I pass any sequence to {attr}`~alphagenome.data.genome.Variant.reference_bases` or does it have to match the reference genome sequence at the variant location?
275
+
276
+ You can pass any sequence to
277
+ {attr}`~alphagenome.data.genome.Variant.reference_bases`. Note that
278
+ {func}`~alphagenome.models.dna_client.DnaClient.predict_variant` is agnostic to
279
+ the alleles in the reference genome, but rather uses the REF/ALT alleles
280
+ specified by the user.
281
+
282
+ ### Are variant predictions for insertions and deletions (indels) supported?
283
+
284
+ Yes. We use left-alignment to specify indels. See
285
+ {class}`~alphagenome.data.genome.Variant` for more details. For scoring indels,
286
+ we adopt SpliceAI's {cite:p}`spliceai` indel alignment strategy: inserted bases
287
+ are summarized by taking the maximum value over the inserted segment, while
288
+ deleted bases are treated as having zero signal in the `ALT` context, thereby
289
+ enabling consistent positional comparisons.
290
+
291
+ ### Which variant scorer should I use for a given modality?
292
+
293
+ In practice, you can use most variant scoring strategies for any modality.
294
+ However, we provide a recommendation for the best strategies based on our
295
+ evaluations in the
296
+ [variant scoring documentation](project:variant_scoring.md#variant-scoring).
297
+
298
+ ### Can I write my own variant scoring strategy?
299
+
300
+ We do not currently support users writing their own variant scoring strategy.
301
+ However, since variant scoring is simply aggregating REF and ALT track
302
+ predictions, you can write your own methods for handling these values.
303
+
304
+ ### What is the difference between a 'quantile_score' and 'raw_score'?
305
+
306
+ The 'raw_score' is the output for a particular variant scoring strategy.
307
+ However, different tracks and modalities yield scores that are on different
308
+ scales. For instance, the
309
+ [Splice Sites Usage scorer](project:variant_scoring.md#splicing-splice-site-usage)
310
+ returns values between 0 and 1, whereas the
311
+ [Gene Expression (RNA-seq)](project:variant_scoring.md#gene-expression-rna-seq)
312
+ scorer returns negative or positive values without bounds. To facilitate
313
+ comparisons across tracks and different variant scoring strategies, we use an
314
+ empirical quantiles approach (see {cite:p}`alphagenome` for full details).
315
+ Briefly, we estimate a background distribution for each variant scorer and track
316
+ using scores for common variants (MAF>0.01 in any gnomAD v3 population). We can
317
+ then convert any 'raw score' into a 'quantile score', representing its rank
318
+ within this background distribution. For example, a variant with a quantile score of
319
+ 0.99 has a score equivalent to the 99th percentile of common variants. This
320
+ provides a measure of predicted impact that is standardized to the same scale
321
+ across different variant scorers and tracks. The maximum (or minimum) value
322
+ never exceeds 0.999990 (or -0.999990), due to the number of variants used to
323
+ compute the quantiles (~300K). Because of this, we recommend using quantile
324
+ scores as an indicator of whether the raw score is unusually large, and use the
325
+ 'raw scores' as a measure of magnitude of the effect for a given scorer and
326
+ track.
327
+
328
+ For signed variant scores (which indicate effect direction like up-regulation or
329
+ down-regulation), their [0,1] quantile probabilities – derived directly from the
330
+ rank order of the original signed raw scores – are linearly transformed to a
331
+ [-1,1] range. This rescaling ensures the quantile score reflects the
332
+ directionality of the raw score. For instance, the 0th percentile (representing
333
+ the most negative raw scores) maps to -1, the 50th percentile (raw scores around
334
+ zero) to 0, and the 100th percentile (most positive raw scores) to +1.
335
+
336
+ Note that quantile scores are only available for the suite of recommended
337
+ scorers.
338
+
339
+ ## Other
340
+
341
+ ### What terms of use apply to AlphaGenome outputs?
342
+
343
+ The AlphaGenome API is provided for non-commercial use only and is subject to
344
+ the AlphaGenome
345
+ [Terms of Service](https://deepmind.google.com/science/alphagenome/terms).
346
+ Outputs generated by AlphaGenome should not be used for the training of other
347
+ machine learning models.
348
+
349
+ ### How should I cite AlphaGenome?
350
+
351
+ If you use AlphaGenome in your research, please cite using:
352
+
353
+ <!-- disableFinding(SNIPPET_INVALID_LANGUAGE) -->
354
+
355
+ ```bibtex
356
+ @article{alphagenome,
357
+ title={{AlphaGenome}: advancing regulatory variant effect prediction with a unified {DNA} sequence model},
358
+ author={Avsec, {\v Z}iga and Latysheva, Natasha and Cheng, Jun and Novati, Guido and Taylor, Kyle R. and Ward, Tom and Bycroft, Clare and Nicolaisen, Lauren and Arvaniti, Eirini and Pan, Joshua and Thomas, Raina and Dutordoir, Vincent and Perino, Matteo and De, Soham and Karollus, Alexander and Gayoso, Adam and Sargeant, Toby and Mottram, Anne and Wong, Lai Hong and Drot{\'a}r, Pavol and Kosiorek, Adam and Senior, Andrew and Tanburn, Richard and Applebaum, Taylor and Basu, Souradeep and Hassabis, Demis and Kohli, Pushmeet},
359
+ year={2025},
360
+ doi={10.1101/2025.06.25.661532},
361
+ publisher={Cold Spring Harbor Laboratory},
362
+ journal={bioRxiv}
363
+ }
364
+ ```
365
+
366
+ <!-- enableFinding(SNIPPET_INVALID_LANGUAGE) -->
367
+
368
+ ### Who should I contact with issues, enquiries and feedback?
369
+
370
+ Submit bugs and any code-related issues on
371
+ [GitHub](https://github.com/google-deepmind/alphagenome). For general feedback,
372
+ questions about usage, and/or feature requests, please use the
373
+ [community forum](https://www.alphagenomecommunity.com) – it's actively
374
+ monitored by our team so you're likely to find answers and insights faster. If
375
+ you can't find what you're looking for, please get in touch with the AlphaGenome
376
+ team at <alphagenome@google.com> and we will be happy to assist you with
377
+ questions. We're working hard to answer all inquiries but there may be a short
378
+ delay in our response due to the high volume we are receiving.
alphagenome/source/docs/source/index.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exploring the genome with AlphaGenome
2
+
3
+ This API provides access to AlphaGenome, Google DeepMind’s unifying model for
4
+ deciphering the regulatory code within DNA sequences.
5
+
6
+ AlphaGenome offers multimodal predictions, encompassing diverse functional
7
+ outputs such as gene expression, splicing patterns, chromatin features, and
8
+ contact maps (see diagram below). The model analyzes DNA sequences of up to 1
9
+ million base pairs in length and can deliver predictions at single base-pair
10
+ resolution for most outputs. AlphaGenome achieves state-of-the-art performance
11
+ across a range of genomic prediction benchmarks, including numerous diverse
12
+ variant effect prediction tasks (detailed in {cite:p}`alphagenome`).
13
+
14
+ The API is offered as a free service for
15
+ [non-commercial use](https://deepmind.google.com/science/alphagenome/terms).
16
+ Query rates vary based on demand – it is well suited for smaller to medium-scale
17
+ analyses such as analyzing a limited number of genomic regions or variants
18
+ requiring 1000s of predictions, but is likely not suitable for large scale
19
+ analyses requiring more than 1 million predictions.
20
+
21
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
22
+ ```{figure} /_static/model_overview.png
23
+ :width: 600px
24
+ :alt: overview of AlphaGenome
25
+ :name: overview-figure
26
+ ```
27
+ <!-- mdformat on -->
28
+
29
+ ## Getting started
30
+
31
+ You can get started by
32
+ [getting an API key](https://deepmind.google.com/science/alphagenome), and
33
+ following our [Quick Start Guide](./colabs/quick_start.ipynb), or watching our
34
+ [AlphaGenome 101 tutorial](https://youtu.be/Xbvloe13nak). Please also check out
35
+ our installation guide, tutorials with comprehensive overviews of plotting,
36
+ variant scoring and other use cases, and our API reference documentation.
37
+
38
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
39
+ ::::{grid} 1 1 2 3
40
+ :gutter: 2
41
+
42
+ :::{grid-item-card}
43
+ :link: installation
44
+ :link-type: doc
45
+ Installation
46
+ ^^^^^^^^^^^^
47
+
48
+ Install `alphagenome` locally.
49
+ :::
50
+
51
+ :::{grid-item-card}
52
+ :link: tutorials/index
53
+ :link-type: doc
54
+ Tutorials
55
+ ^^^^^^^^^
56
+ The tutorials walk through example usage of the AlphaGenome model.
57
+ :::
58
+
59
+ :::{grid-item-card}
60
+ :link: api/index
61
+ :link-type: doc
62
+ API reference
63
+ ^^^^^^^^^^^^^
64
+
65
+ Reference documentation for the `alphagenome` package.
66
+ :::
67
+ ::::
68
+ <!-- mdformat on -->
69
+
70
+ ``` {toctree}
71
+ :maxdepth: 2
72
+ :hidden: False
73
+
74
+ ../colabs/quick_start
75
+ installation
76
+ api/index
77
+ tutorials/index
78
+ user_guides/index
79
+ faqs
80
+ Community <https://www.alphagenomecommunity.com>
81
+ references
82
+ ```
alphagenome/source/docs/source/installation.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Installation
2
+
3
+ The easiest way to install AlphaGenome is via the published
4
+ [PyPI package](https://pypi.org/project/alphagenome).
5
+
6
+ ```bash
7
+ $ pip install -U alphagenome
8
+ ```
9
+
10
+ This will install the latest version of the `alphagenome` package.
11
+
12
+ You may optionally wish to create a
13
+ [Python Virtual Environment](https://docs.python.org/3/tutorial/venv.html) to
14
+ prevent conflicts with your system's Python environment.
15
+
16
+ ## Google Colab
17
+
18
+ The tutorial notebooks include a cell with the commands necessary to install
19
+ `alphagenome` into a colab runtime.
20
+
21
+ ### Add API key to secrets
22
+
23
+ To make model requests using the tutorial notebooks, you need to add the
24
+ AlphaGenome API key to Colab secrets:
25
+
26
+ 1. Open your Google Colab notebook and click on the 🔑 **Secrets** tab in the
27
+ left panel.
28
+ 1. Create a new secret with the name `ALPHA_GENOME_API_KEY`.
29
+ 1. Copy/paste your API key into the `Value` input box of
30
+ `ALPHA_GENOME_API_KEY`.
31
+ 1. Toggle the button on the left to allow notebook access to the secret.
32
+
33
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
34
+
35
+ ```{figure} /_static/secrets.png
36
+ :width: 600px
37
+ :alt: Image of secrets tab found on left panel.
38
+ :name: secrets-screenshot
39
+ ```
40
+ <!-- mdformat on -->
41
+
42
+ ## Running locally
43
+
44
+ To install a local copy of `alphagenome`, clone a local copy of the repository
45
+ and run `pip install`:
46
+
47
+ ```bash
48
+ $ rm -rf ./alphagenome
49
+ $ git clone https://github.com/google-deepmind/alphagenome.git
50
+ $ pip install -e ./alphagenome
51
+ ```
52
+
53
+ We strongly recommend using a virtual environment management system such as
54
+ [miniconda](https://docs.anaconda.com/miniconda/) or
55
+ [uv](https://docs.astral.sh/uv/pip/environments/).
56
+
57
+ In the case of miniconda, installation would be achieved with the following:
58
+
59
+ ```bash
60
+ conda create -n alphagenome-env python=3.11
61
+ conda activate alphagenome-env
62
+ pip install -e ./alphagenome
63
+ ```
64
+
65
+ ### Updating `alphagenome`
66
+
67
+ Assuming the relevant virtual environment is already activated:
68
+
69
+ ```bash
70
+ cd ./alphagenome
71
+ git pull
72
+ pip install --upgrade .
73
+ ```
alphagenome/source/docs/source/references.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # References
2
+
3
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
4
+ ```{bibliography}
5
+ :cited:
6
+ ```
7
+ <!-- mdformat on -->
alphagenome/source/docs/source/refs.bib ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @article{alphagenome,
2
+ title={{AlphaGenome}: advancing regulatory variant effect prediction with a unified {DNA} sequence model},
3
+ author={Avsec, {\v Z}iga and Latysheva, Natasha and Cheng, Jun and Novati, Guido and Taylor, Kyle R. and Ward, Tom and Bycroft, Clare and Nicolaisen, Lauren and Arvaniti, Eirini and Pan, Joshua and Thomas, Raina and Dutordoir, Vincent and Perino, Matteo and De, Soham and Karollus, Alexander and Gayoso, Adam and Sargeant, Toby and Mottram, Anne and Wong, Lai Hong and Drot{\'a}r, Pavol and Kosiorek, Adam and Senior, Andrew and Tanburn, Richard and Applebaum, Taylor and Basu, Souradeep and Hassabis, Demis and Kohli, Pushmeet},
4
+ year={2025},
5
+ doi={10.1101/2025.06.25.661532},
6
+ publisher={Cold Spring Harbor Laboratory},
7
+ journal={bioRxiv}
8
+ }
9
+
10
+ @article{gtex2020gtex,
11
+ title={The GTEx Consortium atlas of genetic regulatory effects across human tissues},
12
+ author={{GTEx Consortium}},
13
+ journal={Science},
14
+ volume={369},
15
+ number={6509},
16
+ pages={1318--1330},
17
+ year={2020},
18
+ publisher={American Association for the Advancement of Science}
19
+ }
20
+
21
+ @article{zhou2022sequence,
22
+ title={Sequence-based modeling of three-dimensional genome architecture from kilobase to chromosome scale},
23
+ author={Zhou, Jian},
24
+ journal={Nature Genetics},
25
+ volume={54},
26
+ number={5},
27
+ pages={725--734},
28
+ year={2022},
29
+ publisher={Nature Publishing Group US New York}
30
+ }
31
+
32
+ @article{borzoi,
33
+ title={Predicting RNA-seq coverage from DNA sequence as a unifying model of gene regulation},
34
+ author={Linder, Johannes and Srivastava, Divyanshi and Yuan, Han and Agarwal, Vikram and Kelley, David R},
35
+ journal={Nature Genetics},
36
+ pages={1--13},
37
+ year={2025},
38
+ publisher={Nature Publishing Group US New York}
39
+ }
40
+
41
+ @article{spliceai,
42
+ title={Predicting splicing from primary sequence with deep learning},
43
+ author={Jaganathan, Kishore and Panagiotopoulou, Sofia Kyriazopoulou and McRae, Jeremy F and Darbandi, Siavash Fazel and Knowles, David and Li, Yang I and Kosmicki, Jack A and Arbelaez, Juan and Cui, Wenwu and Schwartz, Grace B and others},
44
+ journal={Cell},
45
+ volume={176},
46
+ number={3},
47
+ pages={535--548},
48
+ year={2019},
49
+ publisher={Elsevier}
50
+ }
alphagenome/source/docs/source/tutorials/index.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tutorials
2
+
3
+ ``` {toctree}
4
+ :maxdepth: 1
5
+ :hidden:
6
+
7
+ ../colabs/visualization_modality_tour
8
+ ../colabs/variant_scoring_ui
9
+ ../colabs/tissue_ontology_mapping
10
+ ../colabs/batch_variant_scoring
11
+ ../colabs/example_analysis_workflow
12
+ ```
13
+
14
+ <!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
15
+
16
+ ::::{grid} 1 1 2 3
17
+ :gutter: 2
18
+
19
+ :::{grid-item-card} Visualizing predictions
20
+ :link: ../colabs/visualization_modality_tour
21
+ :link-type: doc
22
+
23
+ How to visualize different output modalities.
24
+ :::
25
+
26
+ :::{grid-item-card} Scoring and visualizing a single variant
27
+ :link: ../colabs/variant_scoring_ui
28
+ :link-type: doc
29
+
30
+ Tool for scoring and visualizing a single variant across multiple modalities.
31
+ :::
32
+
33
+ :::{grid-item-card} Navigating data ontologies
34
+ :link: ../colabs/tissue_ontology_mapping
35
+ :link-type: doc
36
+
37
+ Tool for fetching ontology IDs for a given tissue.
38
+
39
+ :::
40
+
41
+ :::{grid-item-card} Batch variant scoring
42
+ :link: ../colabs/batch_variant_scoring
43
+ :link-type: doc
44
+
45
+ Tool for scoring many variants at once.
46
+ :::
47
+
48
+ :::{grid-item-card} Example analysis workflow
49
+ :link: ../colabs/example_analysis_workflow
50
+ :link-type: doc
51
+
52
+ Example analysis of TAL1 locus.
53
+ :::
54
+
55
+ <!-- mdformat on -->
alphagenome/source/docs/source/user_guides/index.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # User guides
2
+
3
+ ``` {toctree}
4
+ :maxdepth: 1
5
+ :hidden:
6
+
7
+ ../colabs/essential_commands
8
+ ../exploring_model_metadata
9
+ ../variant_scoring
10
+ ../visualization_library_basics
11
+ ```
12
+
13
+ <!-- mdformat off(Turn off mdformat to retain grid card syntax.) -->
14
+
15
+ ::::{grid} 1 1 2 3
16
+ :gutter: 2
17
+
18
+ :::{grid-item-card} Essential commands
19
+ :link: ../colabs/essential_commands
20
+ :link-type: doc
21
+
22
+ Essential commands for navigating AlphaGenome.
23
+ :::
24
+
25
+ :::{grid-item-card} Model output metadata
26
+ :link: ../exploring_model_metadata
27
+ :link-type: doc
28
+
29
+ A summary of model outputs and associated metadata.
30
+ :::
31
+
32
+ :::{grid-item-card} Variant scoring
33
+ :link: ../variant_scoring
34
+ :link-type: doc
35
+
36
+ Overview of how variant scores are calculated.
37
+ :::
38
+
39
+ :::{grid-item-card} Visualization basics
40
+ :link: ../visualization_library_basics
41
+ :link-type: doc
42
+
43
+ Guide to visualization tools.
44
+ :::
45
+
46
+ <!-- mdformat on -->
alphagenome/source/docs/source/variant_scoring.md ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How variant scoring works
2
+
3
+ A genomic variant is a difference identified in an individual's genome sequence
4
+ when compared to the reference genome sequence. Many genomic variants likely
5
+ have no appreciable impact, but it can be challenging to identify those that do
6
+ have a particular molecular effect. AlphaGenome predictions can be used to score
7
+ variants and help bridge this gap.
8
+
9
+ To do so, the variant is treated as a pair of sequences: reference (`REF`) and
10
+ alternate (`ALT`). The variant effect is estimated by comparing AlphaGenome
11
+ predictions for these two sequences across different modalities returned by the
12
+ model.
13
+
14
+ ## Detailed steps
15
+
16
+ Variant scoring is implemented as follows:
17
+
18
+ ### Make `REF` and `ALT` predictions for given modality
19
+
20
+ Variant scoring begins by generating predictions for both the reference and
21
+ alternative alleles of a variant, restricted to a given modality of interest
22
+ (e.g., `RNA-SEQ`, `ATAC`, etc.).
23
+
24
+ The model inputs at this stage are the `REF` and `ALT` sequences, whose sequence
25
+ interval contains the variant of interest.
26
+
27
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
28
+
29
+ ```{figure} /_static/variant_scoring_ref_alt.png
30
+ :width: 500px
31
+ :alt: Make `REF` and `ALT` predictions for given modality.
32
+ :name: variant-scoring-1
33
+ ```
34
+
35
+ <!-- mdformat on -->
36
+
37
+ ### Optional - perform indel alignment
38
+
39
+ For insertion or deletion (indel) variants, the `ALT` allele's prediction
40
+ profile is aligned to the `REF` allele's coordinate space. Inserted bases are
41
+ summarized by taking the maximum value over the inserted segment, while deleted
42
+ bases are treated as having zero signal in the `ALT` context, thereby enabling
43
+ consistent positional comparisons.
44
+
45
+ ### Apply spatial mask
46
+
47
+ A spatial mask defines regions of interest within the interval containing the
48
+ variant. This mask can be centered on the variant or encompass a gene (gene
49
+ body, exons, or TSS, based on annotations from a GTF file).
50
+
51
+ At this stage, values outside of the mask are discarded.
52
+
53
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
54
+
55
+ ```{figure} /_static/variant_scoring_spatial_mask.png
56
+ :width: 500px
57
+ :alt: Apply spatial mask.
58
+ :name: variant-scoring-2
59
+ ```
60
+
61
+ <!-- mdformat on -->
62
+
63
+ ### Aggregate spatially and compute `ALT - REF`
64
+
65
+ Aggregation occurs at this stage, which includes the following:
66
+
67
+ * reduction along the spatial axis, using `mean` or `sum`, etc.
68
+ * (optional) scaling, such as a $log$ or $l^2$ transform.
69
+ * difference between `ALT - REF`.
70
+
71
+ The final outcome is a single scalar value per track.
72
+
73
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
74
+
75
+ ```{figure} /_static/variant_scoring_spatial_compute.png
76
+ :width: 500px
77
+ :alt: Aggregate spatially and compute `ALT - REF`.
78
+ :name: variant-scoring-3
79
+ ```
80
+
81
+ <!-- mdformat on -->
82
+
83
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
84
+
85
+ ```{note}
86
+ Aggregation logic is encapsulated in the options listed in
87
+ {class}`~alphagenome.models.variant_scorers.AggregationType`.
88
+
89
+ The naming of the options reflects the order of operations of each of the above
90
+ steps, with the right-most operation applied first to the model predictions.
91
+
92
+ For example,
93
+ {class}`~alphagenome.models.variant_scorers.AggregationType.DIFF_SUM_LOG2`
93
+ applies a log2 transform, then a sum, to track data. It then returns the
95
+ difference between `ALT - REF`.
96
+
97
+ Some aggregation options may apply the exact same steps, but in a different order.
98
+
99
+ Regardless of the order of operations, each aggregation type returns one single
100
+ scalar value per track.
101
+ ```
102
+
103
+ <!-- mdformat on -->
104
+
105
+ ### Optional - aggregate tracks
106
+
107
+ After variant scoring is completed, optional track selection and additional
108
+ aggregation can be applied.
109
+
110
+ Suggestions include additional aggregation (mean, max, sum, etc.) over:
111
+
112
+ * All tracks
113
+ * Subsets of tracks
114
+
115
+ Or, a single track of interest can be chosen, i.e., from a particular sample.
116
+
117
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
118
+
119
+ ```{figure} /_static/variant_scoring_aggregate.png
120
+ :width: 500px
121
+ :alt: Optional - aggregate tracks.
122
+ :name: variant-scoring-4
123
+ ```
124
+
125
+ <!-- mdformat on -->
126
+
127
+ ## Modality-specific recommended variant scorers
128
+
129
+ We have established a set of recommended variant scorers, available via
130
+ {func}`~alphagenome.models.variant_scorers.get_recommended_scorers`, covering
131
+ diverse genomic modalities as outlined below:
132
+
133
+ ### Gene Expression (RNA-seq)
134
+
135
+ Variant scores quantify the impact on overall gene transcript abundance.
136
+
137
+ * comparison: predicted RNA coverage between `REF` and `ALT` alleles
138
+ * mask: exons for a gene of interest
139
+ * aggregation: Log-fold change of gene expression level between the `ALT` and
140
+ `REF` alleles: {math}`\log(mean(ALT) + 0.001) - \log(mean(REF) + 0.001)`
141
+
142
+ ### Polyadenylation Site (PAS) Usage
143
+
144
+ This follows Borzoi's {cite:p}`borzoi` methodology for scoring polyadenylation
145
+ quantitative trait loci (paQTLs), which captures the variant's impact on RNA
146
+ isoform production.
147
+
148
+ * comparison: predicted RNA coverage between `REF` and `ALT` alleles
149
+ * mask: local 400-bp windows around 3' cleavage junctions
150
+ * aggregation: Maximum absolute log-fold change of isoform ratios
151
+ (distal/proximal PAS usage) between `REF` and `ALT`, considering all
152
+ proximal/distal splits.
153
+
154
+ ### TSS Activity (CAGE, PRO-cap)
155
+
156
+ Variant scores quantify local changes at TSSs.
157
+
158
+ * comparison: predicted CAGE or PRO-cap coverage between `REF` and `ALT`
159
+ alleles
160
+ * mask: local 501-bp window centered at the variant
161
+ * aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
162
+ (sum(REF) + 1)]`
163
+
164
+ ### Chromatin Accessibility (ATAC-seq, DNase-seq)
165
+
166
+ Variant scores quantify local accessibility changes.
167
+
168
+ * comparison: predicted ATAC-seq or DNase-seq coverage between `REF` and `ALT`
169
+ alleles
170
+ * mask: local 501-bp window centered at the variant
171
+ * aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
172
+ (sum(REF) + 1)]`
173
+
174
+ ### Transcription Factor Binding (ChIP-TF)
175
+
176
+ Variant scores quantify changes in TF binding intensity.
177
+
178
+ * comparison: predicted ChIP-TF coverage between `REF` and `ALT` alleles
179
+ * mask: local 501-bp window centered at the variant
180
+ * aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
181
+ (sum(REF) + 1)]`
182
+
183
+ ### Histone Modifications (ChIP-Histone)
184
+
185
+ Variant scores quantify changes in histone modifications.
186
+
187
+ * comparison: predicted ChIP-Histone coverage between `REF` and `ALT` alleles
188
+ * mask: local 2001-bp window centered at the variant
189
+ * aggregation: Log2-ratio of summed signals: {math}`log2[(sum(ALT) + 1) /
190
+ (sum(REF) + 1)]`
191
+
192
+ ### Splicing (Splice Sites)
193
+
194
+ Variant scores quantify changes in the class assignment probabilities (acceptor,
195
+ donor) at all potential splice sites within a gene body.
196
+
197
+ * comparison: class assignment probabilities for `REF` and `ALT` alleles
198
+ * mask: gene body for a gene of interest
199
+ * aggregation: Maximum absolute difference of predicted splice site
200
+ probabilities across the gene body: {math}`max(|ALT - REF|)`
201
+
202
+ ### Splicing (Splice Site Usage)
203
+
204
+ Variant scores quantify changes in the usage of splice sites (i.e., increased or
205
+ decreased fractions).
206
+
207
+ * comparison: predicted splice site usage between `REF` and `ALT` alleles
208
+ * mask: gene body for a gene of interest
209
+ * aggregation: Maximum absolute difference of predicted splice site usage
210
+ across the gene body: {math}`max(|ALT - REF|)`
211
+
212
+ ### Splicing (Splice Junctions)
213
+
214
+ Variant scores quantify changes in the predicted RNA-seq reads spanning a
215
+ junction, which is a function of expression level, splice site usage, and
216
+ splicing efficiency.
217
+
218
+ * comparison: predicted paired junction counts between `REF` and `ALT` alleles
219
+ * mask: top-k splice sites for a gene of interest (including annotated and
220
+ predicted splice sites)
221
+ * aggregation: Maximum absolute log-fold change of predicted junction counts
222
+ across splice site pairs of interest: {math}`max(|log(ALT) - log(REF)|)`
223
+
224
+ ### 3D Genome Contact (Contact Maps)
225
+
226
+ Variant scores quantify local contact disruption.
227
+
228
+ * comparison: predicted contact frequencies between `REF` and `ALT` alleles
229
+ * mask: local 1 Mb window centered at the variant
230
+ * aggregation: Mean absolute difference of contact frequencies, for all
231
+ interactions involving the variant-containing bin.
232
+
233
+ ### Active Allele Scorers
234
+
235
+ In addition to the differential scores described above, we also provide scoring
236
+ configurations that capture the absolute activity level associated with one of
237
+ the alleles, rather than quantifying the change between `REF` and `ALT`. This is
238
+ calculated by taking the maximum of the aggregated signals from the `REF` and
239
+ `ALT` alleles over the masked central window or gene region.
240
+
241
+ We provide recommended active allele scorers for the following modalities:
242
+
243
+ * Gene expression (RNA-seq): {math}`max(mean(ALT), mean(REF))` across exons
244
+ for a gene of interest
245
+ * TSS activity (CAGE, PRO-cap): {math}`max(sum(ALT), sum(REF))` within a local
246
+ 501-bp window centered at the variant
247
+ * Chromatin Accessibility (ATAC-seq, DNase-seq): {math}`max(sum(ALT),
248
+ sum(REF))` within a local 501-bp window centered at the variant
249
+ * Transcription Factor binding (ChIP-TF): {math}`max(sum(ALT), sum(REF))`
250
+ within a local 501-bp window centered at the variant
251
+ * Histone modifications (ChIP-Histone): {math}`max(sum(ALT), sum(REF))` within
252
+ a local 2001-bp window centered at the variant
253
+
254
+ ## Available variant scorers
255
+
256
+ For more on the types of variant scorers and how they work, visit the
257
+ [API documentation](api/models.md#variant-scorers).
alphagenome/source/docs/source/visualization_library_basics.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Visualization basics
2
+
3
+ <!-- disableFinding(LINK_ID) -->
4
+
5
+ AlphaGenome predicts a variety of output types with different data shapes and
6
+ biological interpretations ([table](#viz-table)). We provide
7
+ [`alphagenome.visualization`](project:api/visualization.md) to generate
8
+ matplotlib figures from model API outputs, which we outline here.
9
+
10
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
11
+
12
+ ```{tip}
13
+ See the {doc}`visualizing predictions tutorial </colabs/visualization_modality_tour>`
14
+ for worked examples of plotting different modalities.
15
+ ```
16
+
17
+ <!-- mdformat on -->
18
+
19
+ ## Plot
20
+
21
+ The key function, {func}`~alphagenome.visualization.plot_components.plot`, takes
22
+ as input a list of components and returns a {class}`matplotlib.figure.Figure`.
23
+
24
+ ## Components
25
+
26
+ A component is a light wrapper around a model output (such as predicted genomic
27
+ tracks, splice junctions, etc) and specifies plot aesthetics. Each component
28
+ maps to one vertically stacked subplot in the final figure (see blue text in the
29
+ [figure](#viz-figure)). Each component has an independent y-axis but shares a
30
+ common x-axis, corresponding to the length of the DNA interval, in base pairs
31
+ (bp).
32
+
33
+ Several default components are available, each designed to best visually
34
+ represent different modalities and data shapes returned by the model API (see
35
+ [table](#viz-table)).
36
+
37
+ ## Annotations
38
+
39
+ Additional figure elements specific to the DNA interval, but outside of
40
+ components -- such as locations of promoters or variants -- can be overlaid via
41
+ a list of annotations that are passed to
42
+ {func}`~alphagenome.visualization.plot_components.plot`.
43
+
44
+ ## Custom plotting
45
+
46
+ For users interested in configuring novel components, extend the
47
+ {class}`~alphagenome.visualization.plot_components.AbstractComponent` and
48
+ {class}`~alphagenome.visualization.plot_components.AbstractAnnotation` base
49
+ classes.
50
+
51
+ Any other data supplied by the user can be visualized using this library as is,
52
+ as long as it is provided to
53
+ [`plot_components`](project:api/visualization.md#plot-components) in the format
54
+ required e.g. {class}`~alphagenome.data.track_data.TrackData` for
55
+ {class}`~alphagenome.visualization.plot_components.Tracks`.
56
+
57
+ <!-- mdformat off(Turn off mdformat to retain myst syntax.) -->
58
+
59
+ ```{figure} /_static/visualization_overview.png
60
+ :height: 600px
61
+ :alt: visualization library description/overview
62
+ :name: viz-figure
63
+
64
+ Illustrative diagram of visualization library. Blue text indicates
65
+ [`plot_components`](<project:api/visualization.md#plot-components>) classes, and purple text indicates arguments to
66
+ [`plot_components`](<project:api/visualization.md#plot-components>) that adjust figure-wide aesthetics.
67
+ ```
68
+
69
+ ```{list-table} Plotting components and annotation classes.
70
+ :widths: 10 30 10 10 30 10
71
+ :header-rows: 1
72
+ :name: viz-table
73
+
74
+ * - Component name plot\_components.\*
75
+ - Description
76
+ - Example figure
77
+ - Data shape supported
78
+ - Recommended model outputs
79
+ - Good for visualizing variants?
80
+ * - {class}`~alphagenome.visualization.plot_components.Tracks`
81
+ - A line-plot visualizing a scalar value at each genomic position (or
82
+ coarser resolution) e.g. predictions of RNA\_SEQ for a specific sample.
83
+ - Colab cell
84
+ - 1D
85
+ - All except SPLICE\_JUNCTIONS; CONTACT\_MAPS
86
+ - No
87
+ * - {class}`~alphagenome.visualization.plot_components.OverlaidTracks`
88
+ - A line-plot as for Tracks, but with two separate lines on the same axis
89
+ with different colors e.g. predictions of RNA\_SEQ for the Reference and
90
+ Alternative sequence defined by a variant.
91
+ - Colab cell
92
+ - 1D x 2
93
+ - All except SPLICE\_JUNCTIONS; CONTACT\_MAPS
94
+ - Yes
95
+ * - {class}`~alphagenome.visualization.plot_components.Sashimi`
96
+ - A series of arcs, each representing a scalar value for a pair of genomic
97
+ positions (e.g. splice junctions). The thickness of the arcs is
98
+ determined by the relative sizes of the scalars.
99
+ - Colab cell
100
+ - 2D (sparse)
101
+ - SPLICE\_JUNCTIONS
102
+ - Yes
103
+ * - {class}`~alphagenome.visualization.plot_components.SeqLogo`
104
+ - A sequence of letters (bases) with heights corresponding to a single
105
+ scalar value per genomic position (e.g. from contribution scores).
106
+ - Colab cell
107
+ - 1D \+ sequence
108
+ - ISM contribution scores
109
+ - Yes
110
+ * - {class}`~alphagenome.visualization.plot_components.ContactMaps`
111
+ - A heatmap visualizing a matrix of scalars (e.g. predicted DNA-DNA
112
+ contacts), one for each pair of genomic positions in an interval.
113
+ - Colab cell
114
+ - 2D
115
+ - CONTACT\_MAPS
116
+ - No
117
+ * - {class}`~alphagenome.visualization.plot_components.ContactMapsDiff`
118
+ - A heatmap as for ContactMaps, but with a diverging color map centered on
119
+ zero (white) to represent values derived from differences (e.g. ALT \-
120
+ REF)
121
+ - Colab cell
122
+ - 2D
123
+ - CONTACT\_MAPS
124
+ - Yes
125
+ * - {class}`~alphagenome.visualization.plot_components.TranscriptAnnotation`
126
+ - Horizontal lines representing locations of transcripts. Exons, introns,
127
+ untranslated regions, and direction of transcription are indicated by
128
+ differences in line thickness.
129
+ - Colab cell
130
+ - Interval(s)
131
+ - N/A
132
+ - No
133
+ * - {class}`~alphagenome.visualization.plot_components.VariantAnnotation`
134
+ - A semi-transparent rectangle (or vertical line if a variant) spanning
135
+ all plot components, indicating the location of an interval (or
136
+ variant). The interval (variant) is optionally labeled.
137
+ - Colab cell
138
+ - Interval(s) or Variant(s)
139
+ - N/A
140
+ - Yes
141
+ * - {class}`~alphagenome.visualization.plot_components.AbstractComponent`
142
+ - This is an abstract class, which is the parent class of most
143
+ plot\_components.\*. A user can define their own component class,
144
+ provided it adheres to the structure specified by AbstractComponent. The
145
+ workhorse method is plot\_ax(), which populates a matplotlib.axes.Axes
146
+ object with visuals defined by the input data.
147
+ - N/A
148
+ - N/A
149
+ - N/A
150
+ - N/A
151
+ ```
152
+
153
+ <!-- mdformat on -->
alphagenome/source/hatch_build.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Hatch build hook to generate Python bindings for protos."""
16
+
17
+ import os
18
+ from typing import Any
19
+ from grpc_tools import protoc
20
+ from hatchling.builders.hooks.plugin.interface import BuildHookInterface # pylint: disable=g-importing-member
21
+
22
+
23
+ _ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'src')
24
+
25
+ # Tuple of proto message definitions to build Python bindings for. Paths must
26
+ # be relative to root directory.
27
+ _ALPHAGENOME_PROTOS = (
28
+ 'alphagenome/protos/dna_model.proto',
29
+ 'alphagenome/protos/dna_model_service.proto',
30
+ 'alphagenome/protos/tensor.proto',
31
+ )
32
+
33
+
34
class GenerateProtos(BuildHookInterface):
  """Build hook that compiles the alphagenome protos into Python modules.

  Each entry of `_ALPHAGENOME_PROTOS` is passed to `grpc_tools.protoc` with
  `_ROOT_DIR` used both as the proto search path and as the output directory
  for the `--python_out` and `--grpc_python_out` generators.
  """

  def initialize(self, version: str, build_data: dict[str, Any]) -> None:
    """Runs protoc once for every proto listed in `_ALPHAGENOME_PROTOS`.

    Args:
      version: Unused.
      build_data: Unused.

    Raises:
      RuntimeError: If protoc returns a non-zero status for any proto file.
    """
    del version, build_data  # Unused.

    # These arguments are identical for every proto; only the input differs.
    shared_args = (
        'grpc_tools.protoc',
        f'--proto_path={_ROOT_DIR}',
        f'--python_out={_ROOT_DIR}',
        f'--grpc_python_out={_ROOT_DIR}',
    )
    for relative_path in _ALPHAGENOME_PROTOS:
      proto_args = [*shared_args, os.path.join(_ROOT_DIR, relative_path)]
      status = protoc.main(proto_args)
      if status != 0:
        raise RuntimeError(f'ERROR: {proto_args}')
alphagenome/source/pyproject.toml ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ build-backend = 'hatchling.build'
3
+ requires = ['hatchling', 'grpcio-tools<=1.67.1', 'importlib-resources']
4
+
5
+
6
+ [project]
7
+ name = 'alphagenome'
8
+ description = 'A Python SDK for interacting and visualizing genomic models.'
9
+ readme = 'README.md'
10
+ dynamic = ['version']
11
+ license = { file = 'LICENSE' }
12
+ requires-python = '>=3.10'
13
+ authors = [
14
+ {name = 'Google LLC'},
15
+ {email = 'alphagenome@google.com'},
16
+ ]
17
+ keywords = [
18
+ 'python',
19
+ 'machine learning',
20
+ 'genomics'
21
+ ]
22
+ classifiers=[
23
+ 'Development Status :: 4 - Beta',
24
+ 'Environment :: Console',
25
+ 'Intended Audience :: Science/Research',
26
+ 'License :: OSI Approved :: Apache Software License',
27
+ 'Operating System :: OS Independent',
28
+ 'Programming Language :: Python :: 3.10',
29
+ 'Programming Language :: Python :: 3.11',
30
+ 'Programming Language :: Python :: 3.12',
31
+ 'Programming Language :: Python :: 3.13',
32
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
33
+ ]
34
+ dependencies=[
35
+ # keep-sorted start
36
+ 'absl-py',
37
+ 'anndata',
38
+ 'grpcio>=1.67.1',
39
+ 'immutabledict',
40
+ 'intervaltree',
41
+ 'jaxtyping',
42
+ 'matplotlib',
43
+ 'ml_dtypes',
44
+ 'numpy',
45
+ 'pandas',
46
+ 'protobuf>=5.28.3',
47
+ 'pyarrow',
48
+ 'scipy',
49
+ 'seaborn',
50
+ 'tqdm',
51
+ 'typeguard',
52
+ 'typing_extensions',
53
+ 'zstandard',
54
+ # keep-sorted end
55
+ ]
56
+
57
+ [project.urls]
58
+ Repository = 'https://github.com/google-deepmind/alphagenome'
59
+ Documentation = 'https://www.alphagenomedocs.com/'
60
+
61
+ [project.optional-dependencies]
62
+ dev = [
63
+ 'hatch',
64
+ ]
65
+ docs = [
66
+ 'ipykernel',
67
+ 'ipython',
68
+ 'myst-nb',
69
+ 'sphinx>=5.0',
70
+ 'sphinx-autodoc-typehints',
71
+ 'sphinx-book-theme',
72
+ 'sphinx-copybutton',
73
+ 'sphinx-remove-toctrees',
74
+ 'sphinx-design',
75
+ 'sphinxcontrib-bibtex>=1.0.0',
76
+ ]
77
+ scripts = [
78
+ 'absl-py',
79
+ 'pyarrow',
80
+ 'pyranges',
81
+ ]
82
+
83
+ # Calls hatch_build.py to generate Python bindings for protos.
84
+ [tool.hatch.build.hooks.custom]
85
+
86
+ [tool.hatch.version]
87
+ path = 'src/alphagenome/__init__.py'
88
+
89
+ [tool.hatch.envs.default]
90
+ installer = 'uv'
91
+
92
+ [tool.setuptools.packages.find]
93
+ include = ['README.md', 'LICENSE']
94
+ exclude = ['*_test.py', 'examples']
95
+
96
+ [tool.hatch.envs.hatch-test]
97
+ default-args = []
98
+ extra-dependencies=['google-benchmark', 'typeguard==2.13.3']
99
+ parallel = true
100
+
101
+
102
+ [tool.hatch.envs.hatch-test.env-vars]
103
+ MPLBACKEND = 'agg'
104
+
105
+ [[tool.hatch.envs.hatch-test.matrix]]
106
+ # Use hatch test --all to run tests on all supported Python versions.
107
+ python = ['3.13', '3.12', '3.11', '3.10']
108
+
109
+ [tool.hatch.envs.check]
110
+ dependencies = [
111
+ 'pyink>=24.3.0',
112
+ 'pylint>=2.6.0',
113
+ ]
114
+ # Do not install dependencies for the check environment.
115
+ detached = true
116
+
117
+ [tool.hatch.envs.check.scripts]
118
+ format = 'pyink . --check'
119
+ lint = 'pylint .'
120
+ all = [
121
+ 'format',
122
+ 'lint',
123
+ ]
124
+
125
+ [tool.pyink]
126
+ # Formatting configuration to follow Google style-guide
127
+ line-length = 80
128
+ unstable = true
129
+ pyink-indentation = 2
130
+ pyink-use-majority-quotes = true
131
+ exclude = 'src/alphagenome/protos'
alphagenome/source/scripts/process_gtf.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Script to process GTF into feather file."""
16
+
17
+ from absl import app
18
+ from absl import flags
19
+ from absl import logging
20
+ import pyranges
21
+
22
+
23
+ _GTF_PATH = flags.DEFINE_string(
24
+ 'gtf_path', None, 'Path to GTF file.', required=True
25
+ )
26
+ _OUTPUT_PATH = flags.DEFINE_string(
27
+ 'output_path', None, 'Path to output feather file.', required=True
28
+ )
29
+
30
+
31
def main(_) -> None:
  """Converts the GTF at --gtf_path into a feather file at --output_path.

  The GTF is loaded as a DataFrame and gains a `gene_id_nopatch` column that
  holds the part of each `gene_id` preceding its first '.'.
  """
  source_path = _GTF_PATH.value
  destination_path = _OUTPUT_PATH.value

  logging.info('Reading GTF from %s', source_path)
  gtf = pyranges.read_gtf(source_path, as_df=True)

  # Column 0 of the expanded split is the text before the first '.'.
  gene_id_parts = gtf['gene_id'].str.split('.', expand=True)
  gtf['gene_id_nopatch'] = gene_id_parts[0]

  logging.info('Writing GTF to %s', destination_path)
  gtf.to_feather(destination_path)
39
+
40
+
41
+ if __name__ == '__main__':
42
+ app.run(main)
alphagenome/source/src/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ src Package Initialization File
4
+ """
alphagenome/source/src/alphagenome/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A Python SDK for interacting and visualizing genomic models."""
16
+
17
+
18
+ __version__ = '0.2.0'
alphagenome/source/src/alphagenome/colab_utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utility functions for Google Colab."""
16
+
17
+ import os
18
+
19
+
20
def get_api_key(secret: str = 'ALPHA_GENOME_API_KEY'):
  """Looks up the API key in the environment, then in Colab secrets.

  A non-empty environment variable named `secret` wins. Otherwise, if
  `google.colab` can be imported, the Colab secret of the same name is used.

  Args:
    secret: Name of the environment variable or Colab secret holding the key.

  Returns:
    The API key.

  Raises:
    ValueError: If the key is neither in the environment nor retrievable from
      Colab secrets.
  """
  env_value = os.environ.get(secret)
  if env_value:
    return env_value

  try:
    # pylint: disable=g-import-not-at-top, import-outside-toplevel
    from google.colab import userdata  # pytype: disable=import-error
    # pylint: enable=g-import-not-at-top, import-outside-toplevel

    try:
      return userdata.get(secret)
    except (
        userdata.NotebookAccessError,
        userdata.SecretNotFoundError,
        userdata.TimeoutException,
    ) as colab_error:
      message = (
          f'Cannot find or access API key in Colab secrets with {secret=}. Make'
          ' sure you have added the API key to Colab secrets and enabled'
          ' access. See'
          ' https://www.alphagenomedocs.com/installation.html#add-api-key-to-secrets'
          ' for more details.'
      )
      raise ValueError(message) from colab_error
  except ImportError:
    # google.colab could not be imported, so we are not running in Colab.
    pass

  raise ValueError(f'Cannot find API key with {secret=}.')
alphagenome/source/src/alphagenome/colab_utils_test.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import sys
17
+ from unittest import mock
18
+
19
+ from absl.testing import absltest
20
+ from alphagenome import colab_utils
21
+
22
+
23
+ _TEST_SECRET_KEY = '_TEST_ALPHAGENOME_API_KEY'
24
+
25
+
26
class ColabUtilsTest(absltest.TestCase):
  """Tests for `colab_utils.get_api_key`.

  Every test that is not about the environment lookup clears `os.environ`
  for its duration, so a real API key exported on the machine running the
  tests cannot short-circuit the code path under test.
  """

  def test_get_api_key_from_environment(self):
    with mock.patch.dict(os.environ, {_TEST_SECRET_KEY: 'foo'}):
      self.assertEqual(colab_utils.get_api_key(_TEST_SECRET_KEY), 'foo')

  def test_get_api_key_from_environment_not_found_raises_error(self):
    # Empty environment: the lookup must miss regardless of the host setup.
    with mock.patch.dict(os.environ, clear=True):
      with self.assertRaisesRegex(
          ValueError,
          f"Cannot find API key with secret='{_TEST_SECRET_KEY}'.",
      ):
        _ = colab_utils.get_api_key(_TEST_SECRET_KEY)

  def test_get_api_key_from_colab_secrets(self):
    mock_colab = mock.MagicMock()
    mock_colab.userdata.get.return_value = 'bar'

    # The default secret name is used here, so the environment is cleared to
    # make sure the value comes from the mocked Colab secrets and not from an
    # exported ALPHA_GENOME_API_KEY.
    with mock.patch.dict(os.environ, clear=True), mock.patch.dict(
        sys.modules, {'google': mock.MagicMock(), 'google.colab': mock_colab}
    ):
      self.assertEqual(colab_utils.get_api_key(), 'bar')

  def test_get_api_key_from_colab_secrets_not_found_raises_error(self):
    mock_colab = mock.MagicMock()
    # The error types must be real exception classes to be usable in `except`.
    mock_colab.userdata.NotebookAccessError = Exception
    mock_colab.userdata.SecretNotFoundError = Exception
    mock_colab.userdata.TimeoutException = Exception

    mock_colab.userdata.get.side_effect = Exception()

    with mock.patch.dict(os.environ, clear=True), mock.patch.dict(
        sys.modules, {'google': mock.MagicMock(), 'google.colab': mock_colab}
    ):
      secret = 'my_secret'
      with self.assertRaisesRegex(
          ValueError,
          f'Cannot find or access API key in Colab secrets with {secret=}.',
      ):
        _ = colab_utils.get_api_key(secret)
65
+
66
+
67
+ if __name__ == '__main__':
68
+ absltest.main()
alphagenome/source/src/alphagenome/data/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Data classes for interacting and visualizing genomic models."""
alphagenome/source/src/alphagenome/data/fold_intervals.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Google LLC.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Genomics intervals used for training model folds."""
16
+
17
+ import enum
18
+
19
+ from alphagenome.models import dna_client
20
+ import immutabledict
21
+ import pandas as pd
22
+
23
+
24
+ _DEFAULT_EXAMPLE_REGIONS = immutabledict.immutabledict({
25
+ dna_client.Organism.HOMO_SAPIENS: (
26
+ 'https://github.com/calico/borzoi/raw/'
27
+ '5c9358222b5026abb733ed5fb84f3f6c77239b37/data/sequences_human.bed.gz'
28
+ ),
29
+ dna_client.Organism.MUS_MUSCULUS: (
30
+ 'https://github.com/calico/borzoi/raw/'
31
+ '5c9358222b5026abb733ed5fb84f3f6c77239b37/data/sequences_mouse.bed.gz'
32
+ ),
33
+ })
34
+
35
+
36
class Subset(enum.Enum):
  """Enumerates the data subsets: train, validation and test."""

  TRAIN = 0
  VALID = 1
  TEST = 2
42
+
43
+
44
+ # Fold ONE is aligned with all trained Borzoi checkpoints: 3 and 4 are held out.
45
+ _VALID_FOLD = immutabledict.immutabledict({
46
+ 0: 'fold0',
47
+ 1: 'fold3',
48
+ 2: 'fold2',
49
+ 3: 'fold6',
50
+ -1: 'fold0',
51
+ })
52
+
53
+ _TEST_FOLD = immutabledict.immutabledict({
54
+ 0: 'fold1',
55
+ 1: 'fold4',
56
+ 2: 'fold5',
57
+ 3: 'fold7',
58
+ -1: 'fold1',
59
+ })
60
+
61
+ _MODEL_VERSION_TO_FOLD = immutabledict.immutabledict({
62
+ dna_client.ModelVersion.FOLD_0: 0,
63
+ dna_client.ModelVersion.FOLD_1: 1,
64
+ dna_client.ModelVersion.FOLD_2: 2,
65
+ dna_client.ModelVersion.FOLD_3: 3,
66
+ dna_client.ModelVersion.ALL_FOLDS: -1,
67
+ })
68
+
69
+
70
def get_all_folds() -> list[str]:
  """Returns a new list with the eight fold names, 'fold0' through 'fold7'."""
  fold_names = []
  for index in range(8):
    fold_names.append(f'fold{index}')
  return fold_names
73
+
74
+
75
def get_fold_names(
    model_version: dna_client.ModelVersion, subset: Subset
) -> list[str]:
  """Returns the names of the data folds in `subset` for a model version.

  Args:
    model_version: Model version, mapped to a fold index through
      `_MODEL_VERSION_TO_FOLD`.
    subset: Which subset of the data to return folds for.

  Returns:
    For VALID and TEST, a one-element list with the fold held out for that
    purpose. For TRAIN, every fold except the VALID and TEST folds, or all
    folds when the model version maps to fold index -1.

  Raises:
    ValueError: If `subset` is not a known `Subset` member.
  """
  if subset == Subset.VALID:
    return [_VALID_FOLD[_MODEL_VERSION_TO_FOLD[model_version]]]

  if subset == Subset.TEST:
    return [_TEST_FOLD[_MODEL_VERSION_TO_FOLD[model_version]]]

  if subset == Subset.TRAIN:
    every_fold = get_all_folds()
    # Index -1 marks the model trained on all folds: nothing is held out.
    if _MODEL_VERSION_TO_FOLD[model_version] == -1:
      return every_fold
    held_out = get_fold_names(model_version, Subset.VALID) + get_fold_names(
        model_version, Subset.TEST
    )
    return [fold for fold in every_fold if fold not in held_out]

  raise ValueError(f'Unknown {subset=}')
96
+
97
+
98
def get_fold_intervals(
    model_version: dna_client.ModelVersion,
    organism: dna_client.Organism,
    subset: Subset,
    example_regions_path: str | None = None,
) -> pd.DataFrame:
  """Returns the intervals of the requested subset for the model version.

  Args:
    model_version: Model version whose fold assignment is used.
    organism: Organism used to pick the default regions file when
      `example_regions_path` is not given.
    subset: Subset (train, validation or test) to return intervals for.
    example_regions_path: Optional path to a tab-separated file with columns
      chromosome, start, end and fold. Defaults to the entry of
      `_DEFAULT_EXAMPLE_REGIONS` for `organism`.

  Returns:
    The rows of the regions table whose `fold` is one of the folds returned by
    `get_fold_names(model_version, subset)`.
  """
  regions_path = (
      _DEFAULT_EXAMPLE_REGIONS[organism]
      if example_regions_path is None
      else example_regions_path
  )

  regions = pd.read_csv(
      regions_path,
      sep='\t',
      names=['chromosome', 'start', 'end', 'fold'],
  )
  selected_folds = get_fold_names(model_version, subset)
  return regions[regions.fold.isin(selected_folds)]
+ ]