{ "cells": [ { "cell_type": "markdown", "id": "f05643b2", "metadata": {}, "source": [ "# Tutorial 2: Analyzing G-Tensors\n", "\n", "This tutorial demonstrates how to use the Python API to manipulate, analyze, and plot G-Tensor data.\n", "\n", "## Prerequisites\n", "\n", "Before starting this tutorial, ensure you have:\n", "- The MuTopia package installed\n", "- The pre-compiled data downloaded to the `tutorial_data` directory\n", "\n", "## 1. The elements of a G-Tensor" ] }, { "cell_type": "code", "execution_count": 1, "id": "e4052080", "metadata": {}, "outputs": [], "source": [ "import mutopia.analysis as mu\n", "import mutopia.plot.track_plot as tr\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "id": "799ed777", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<xarray.Dataset> Size: 352MB\n",
"Dimensions: (configuration: 2, context: 96, locus: 388247,\n",
" sample: 185)\n",
"Coordinates:\n",
" * configuration (configuration) <U12 96B 'C/T-centered' 'A/G...\n",
" * context (context) <U7 3kB 'A[C>A]A' ... 'T[T>C]T'\n",
" * locus (locus) int64 3MB 0 1 2 ... 388245 388246\n",
" * sample (sample) <U36 27kB '0040b1b6-b07a-4b6e-90ef-...\n",
"Data variables: (12/25)\n",
" Features/GeneExpression (locus) float32 2MB nan nan nan ... nan nan nan\n",
" Features/GeneStrand (locus) int8 388kB 0 0 0 0 0 0 ... 0 0 0 0 0 0\n",
" Features/GenePosition (locus) float32 2MB nan nan nan ... nan nan nan\n",
" Features/ReplicationStrand (locus) int8 388kB 0 0 0 1 1 1 ... 0 0 0 0 0 0\n",
" Features/RepliseqS4 (locus) float32 2MB 7.007 2.3 ... 5.495 5.105\n",
" Features/NucleotideRatio (locus) float32 2MB 0.3104 0.2271 ... 0.239\n",
" ... ...\n",
" Regions/chrom (locus) <U5 8MB 'chr1' 'chr1' ... 'chr9' 'chr9'\n",
" Regions/start (locus) int64 3MB 810000 820000 ... 138190000\n",
" Regions/end (locus) int64 3MB 820000 830000 ... 138200000\n",
" Regions/length (locus) float32 2MB 3.141e+03 ... 3.408e+03\n",
" Regions/context_frequencies (configuration, context, locus) float32 298MB ...\n",
" Regions/exposures (locus) float64 3MB 1.0 1.0 1.0 ... 1.0 1.0 1.0\n",
"Attributes:\n",
" name: liver_simple\n",
" dtype: sbs\n",
" genome_file: /n/data1/hms/dbmi/park/ctDNA_loci_project/locusregressio...\n",
" fasta_file: /n/data1/hms/dbmi/park/SOFTWARE/REFERENCE/hg38/cgap_matc...\n",
" blacklist_file: /n/data1/hms/dbmi/park/ctDNA_loci_project/locusregressio...\n",
" region_size: 10000\n",
" filename: tutorial_data/Liver.nc\n",
" regions_file: Liver.nc.regions.bed<xarray.DataArray 'length' (locus: 388247)> Size: 2MB\n",
"array([ 3141., 10725., 4845., ..., 6438., 3450., 3408.], dtype=float32)\n",
"Coordinates:\n",
" * locus (locus) int64 3MB 0 1 2 3 4 ... 388242 388243 388244 388245 388246<xarray.DataArray 'Regions/length' (locus: 388247)> Size: 2MB\n",
"array([ 3141., 10725., 4845., ..., 6438., 3450., 3408.], dtype=float32)\n",
"Coordinates:\n",
" * locus (locus) int64 3MB 0 1 2 3 4 ... 388242 388243 388244 388245 388246<xarray.Dataset> Size: 35MB\n",
"Dimensions: (locus: 388247)\n",
"Coordinates:\n",
" * locus (locus) int64 3MB 0 1 2 3 ... 388243 388244 388245 388246\n",
"Data variables: (12/19)\n",
" GeneExpression (locus) float32 2MB nan nan nan nan ... nan nan nan nan\n",
" GeneStrand (locus) int8 388kB 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0\n",
" GenePosition (locus) float32 2MB nan nan nan nan ... nan nan nan nan\n",
" ReplicationStrand (locus) int8 388kB 0 0 0 1 1 1 1 1 1 ... -1 0 0 0 0 0 0 0\n",
" RepliseqS4 (locus) float32 2MB 7.007 2.3 2.088 ... 5.1 5.495 5.105\n",
" NucleotideRatio (locus) float32 2MB 0.3104 0.2271 0.1438 ... 0.239 0.239\n",
" ... ...\n",
" H3K4me3 (locus) float32 2MB 0.2934 5.248 0.3616 ... 0.1928 0.1672\n",
" RepliseqG1b (locus) float32 2MB 45.48 52.38 52.3 ... 18.0 16.9 17.2\n",
" RepliseqS2 (locus) float32 2MB 6.4 6.0 6.1 6.1 ... 29.4 29.7 31.5\n",
" RepliseqS1 (locus) float32 2MB 24.92 27.42 27.89 ... 18.4 19.49 21.0\n",
" H3K27ac (locus) float32 2MB 0.9603 7.082 ... 0.07612 0.05248\n",
" H3K27me3 (locus) float32 2MB 0.0852 0.0844 ... 0.2999 0.1846\n",
"Attributes:\n",
" name: liver_simple\n",
" dtype: sbs\n",
" genome_file: /n/data1/hms/dbmi/park/ctDNA_loci_project/locusregressio...\n",
" fasta_file: /n/data1/hms/dbmi/park/SOFTWARE/REFERENCE/hg38/cgap_matc...\n",
" blacklist_file: /n/data1/hms/dbmi/park/ctDNA_loci_project/locusregressio...\n",
" region_size: 10000\n",
" filename: tutorial_data/Liver.nc\n",
" regions_file: Liver.nc.regions.bed<xarray.Dataset> Size: 652MB\n",
"Dimensions: (configuration: 2, context: 96, locus: 388247,\n",
" sample: 185)\n",
"Coordinates:\n",
" * configuration (configuration) <U12 96B 'C/T-centered' 'A/G...\n",
" * context (context) <U7 3kB 'A[C>A]A' ... 'T[T>C]T'\n",
" * locus (locus) int64 3MB 0 1 2 ... 388245 388246\n",
" * sample (sample) <U36 27kB '0040b1b6-b07a-4b6e-90ef-...\n",
"Data variables: (12/27)\n",
" Features/GeneExpression (locus) float32 2MB nan nan nan ... nan nan nan\n",
" Features/GeneStrand (locus) int8 388kB 0 0 0 0 0 0 ... 0 0 0 0 0 0\n",
" Features/GenePosition (locus) float32 2MB nan nan nan ... nan nan nan\n",
" Features/ReplicationStrand (locus) int8 388kB 0 0 0 1 1 1 ... 0 0 0 0 0 0\n",
" Features/RepliseqS4 (locus) float32 2MB 7.007 2.3 ... 5.495 5.105\n",
" Features/NucleotideRatio (locus) float32 2MB 0.3104 0.2271 ... 0.239\n",
" ... ...\n",
" Regions/end (locus) int64 3MB 820000 830000 ... 138200000\n",
" Regions/length (locus) float32 2MB 3.141e+03 ... 3.408e+03\n",
" Regions/context_frequencies (configuration, context, locus) float32 298MB ...\n",
" Regions/exposures (locus) float64 3MB 1.0 1.0 1.0 ... 1.0 1.0 1.0\n",
" empirical_marginal (configuration, context, locus) float32 298MB ...\n",
" empirical_marginal_locus (locus) float32 2MB 0.0001265 3.065e-05 ... 0.0\n",
"Attributes:\n",
" name: liver_simple\n",
" dtype: sbs\n",
" genome_file: /n/data1/hms/dbmi/park/ctDNA_loci_project/locusregressio...\n",
" fasta_file: /n/data1/hms/dbmi/park/SOFTWARE/REFERENCE/hg38/cgap_matc...\n",
" blacklist_file: /n/data1/hms/dbmi/park/ctDNA_loci_project/locusregressio...\n",
" region_size: 10000\n",
" filename: tutorial_data/Liver.nc\n",
" regions_file: Liver.nc.regions.bed| feature | \n", "GeneExpression | \n", "GeneStrand | \n", "GenePosition | \n", "
|---|---|---|---|
| locus | \n", "\n", " | \n", " | \n", " |
| 0 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "
| 1 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "
| 2 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "
<xarray.Dataset> Size: 2MB\n",
"Dimensions: (configuration: 2, context: 96, locus: 388247)\n",
"Coordinates:\n",
" sample <U36 144B '0040b1b6-b07a-4b6e-90ef-133523eaf412'\n",
"Dimensions without coordinates: configuration, context, locus\n",
"Data variables:\n",
" X (configuration, context, locus) float32 2MB <GCXS: nnz=13687, fill_value=0.0>\n",
" ploidy (locus) float64 0B <COO: nnz=0, fill_value=0.0>"
],
"text/plain": [
"