htslibsam.h库使用说明

it2023-07-16  88

介绍

samtools用来处理SAM/BAM文件,包括htslib,samtools,bcftools,pysam是python语言对samtools的封装,有着完整的说明文档。但是如果要用C/C++来操作SAM/BAM文件,一定要了解htslib库,我查了很多资料,关于htslib库的说明很少,基本都要靠看源代码,以下总结一些源代码的用法,能够快速学会并使用sam.h。

使用

1. 代码阅读

typedef struct { mplp_aux_t *array; bam_mplp_t iter; int n; int *n_plp; //每个平台的reads num const bam_pileup1_t **plp; //每个平台在某个位点的所有reads faidx_t *fai; } mpileup_group_t;

 malp_aux_t

//每个bam文件 typedef struct { /* data */ samFile *fp; //文件指针 hts_itr_t *iter; //每次返回一条比对结果 hts_idx_t *idx; //所以 bam_hdr_t *h; //头部信息 } mplp_aux_t; typedef struct{ //只列出一部分 int tid; int curr_tid; } hts_itr_t; typedef struct { //只展示一部分 int32_t n_targets; //ref个数 uint32_t l_text; //head长度 uint32_t *target_len; //ref长度 char **target_name; //ref name } bam_hdr_t;

bam_pileup1_t 

覆盖某个位点的所有reads

存储位点信息

typedef struct { bam1_t *b; int32_t qpos; //alignment在read的位置 int indel, level; uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; } bam_pileup1_t;

 存储bam信息

data存放的是二进制数据,用来解析cigar,获取碱基等

typedef struct { bam1_core_t core; //read详细信息 int l_data, m_data; uint8_t *data; #ifndef BAM_NO_ID uint64_t id; #endif } bam1_t;

 core用来存储偏移信息

typedef struct { int32_t tid; //染色体 int32_t pos; //坐标 uint32_t bin:16, qual:8, l_qname:8; uint32_t flag:16, n_cigar:16; int32_t l_qseq; int32_t mtid; //mate tid int32_t mpos; //mate pos int32_t isize; //insert size } bam1_core_t;

2. 简单功能 两种处理方式

pileup,按照位点遍历出覆盖位点的所有reads

//初始化,只列关键代码 std::vector<char *> v_bams; //bam路径数组 mpileup_group_t *group = (mpileup_group_t *)calloc(1, sizeof(mpileup_group_t)); group->array = (mplp_aux_t **)calloc(vbams.size(), sizeof(mplp_aux_t *)); //(void **) group->plp = (const bam_pileup1_t **)calloc(vbams.size(), sizeof(bam_pileup1_t *)); group->n_plp = (int *)calloc(vbams.size(), sizeof(int)); group->n = vbams.size(); group->fai = fai_load(ref); //i为平台index group->array[i]->fp = sam_open(vbams[i], "r"); group->array[i]->h = sam_hdr_read(group->array[i]->fp); group->array[i]->idx = sam_index_load(group->array[i]->fp, vbams[i]); group->array[i]->iter = sam_itr_querys(group->array[i]->idx, group->array[i]->h, reg); group->iter = bam_mplp_init(vbams.size(), readaln,(void **) group->array); //处理 mpileup_group_t *group; //一个位点一个位点的遍历 //tid pos会被赋值,n_plp说明各个平台platfrom覆盖位点的reads数目 //ret 大于0,说明bam没有结束 int tid, pos; ret = bam_mplp_auto(group->iter, &tid, &pos, group->n_plp, group->plp)) int platform; //说明是第几个平台 //tid int->char //platform说明是第几个平台 char *chrom = group->array[platform]->h->target_name[tid]; //得到某个区间的ref碱基 int len = 0; char * windows_ref_base =faidx_fetch_seq(group->fai, chrom, pos-50, pos+50, &len); const bam_pileup1_t *piles = group->plp[platform] //n = reads num int n = group->n_plp[platform]; for(int i = 0; i < n; i++) { const bam_pileup1_t *p = piles + i; p->is_del //是否是deletion p->indel == 0 //无indel突变 p->indel > 0 //ins p->indel < 0 //del p->b->core.qual //mapping quality int l_qseq = p->b->core.l_qseq; //read序列长度 p->b->core.flag //read flag uint32_t *cigar = bam_get_cigar(p->b); // cigar bam->core.isize //insert size int n_cigar = p->b->core.n_cigar; //cigar number // if (bam_cigar_op(cigar[0])== BAM_CSOFT_CLIP) {} //某个位置的cigar uint8_t *qual = bam_get_qual(p->b); //得到read所有碱基对应的quality // bam_get_seq(p->b) // 返回read的碱基 bam_seqi(bam_get_seq(p->b), p->qpos + j) //某个位置的碱基,但是返回的是数值 (char)seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]//某个位置的碱基,但是返回的是ACGT // uint8_t* s = bam_aux_get(p->b,"NM") //get mismatch int64_t n_mis = bam_aux2i(s); //mismatch数值 // bam_is_rev(p->b) //是否为负链 }

一行一行的读bam

samFile *fp = sam_open(vbams[platform], "r"); bam_hdr_t *h = sam_hdr_read(fp); hts_idx_t *idx = sam_index_load(fp, vbams[platform]); hts_itr_t *iter = bam_itr_querys(idx,h,regs); // bam1_t *bam = bam_init1(); //int read_num = 0; //会把每一行内容复制给bam while (sam_itr_next(fp, iter, bam)>0) { uint8_t* s = bam_aux_get(bam,"NM");//get mismatch uint8_t* xa = bam_aux_get(bam,"XA"); bam->core.flag //和pileup一样的用法 }

 

最新回复(0)