본문 바로가기
Python

Python VCF file 데이터 불러오기

by 코딩하는 미토콘드리아 bioinformatics 2024. 4. 16.
반응형

VCF file 데이터 불러오기 1: INFO 컬럼만 추출하기

# Collect the INFO column of every record in a VCF file and print it.
#
# The column header is the first line that is neither blank nor a '##'
# meta-information line; the position of 'INFO' in that line decides which
# field is extracted from each following record.
vcf_list = []
colID_toParse = -1
header = True
info_column_found = False  # becomes True once INFO is located in the header

with open('sample_vcf.vcf', 'r') as fOpen:
    for i in fOpen:
        i = i.rstrip('\r\n')
        # VCF files start with '##' meta-information lines that carry no
        # column names; treating one of them as the header would wrongly
        # report a missing INFO column. Blank lines hold no record either.
        if not i or i.startswith('##'):
            continue
        iSplit = i.split('\t')
        if header:
            header = False
            try:
                colID_toParse = iSplit.index('INFO')
            except ValueError:
                print('Error, INFO column not present in header! Please check file.')
                break  # no INFO column: nothing to extract, stop reading
            info_column_found = True
        else:
            # Reaching a data line implies the header was parsed successfully,
            # because a failed header lookup leaves the loop immediately.
            vcf_list.append(iSplit[colID_toParse])

if info_column_found:
    for i in vcf_list:
        print(i)

 

VCF file 데이터 불러오기 2: 주요 필드(CHROM, POS, REF, ALT, INFO) 파싱하기

 

def parse_vcf(vcf_file):
    """Read the variant records of a tab-separated VCF file.

    Lines starting with '#' (meta-information and the column header) and
    blank lines are ignored.

    Args:
        vcf_file: Path of the VCF file to read.

    Returns:
        A list with one tuple per record:
        (chromosome, position, reference, alternate, info), where position
        is an int and the other items are strings taken from columns
        1, 4, 5 and 8 of the record.

    Raises:
        IndexError: If a record has fewer than eight tab-separated fields.
        ValueError: If the position field is not an integer.
    """
    vcf_data = []
    with open(vcf_file, 'r') as f:
        for line in f:
            # Skip header lines
            if line.startswith('#'):
                continue
            record = line.strip()
            # A blank line (typically a trailing newline at the end of the
            # file) holds no record; indexing its fields would fail.
            if not record:
                continue
            # Split the line into fields
            fields = record.split('\t')
            # Extract relevant information
            chromosome = fields[0]
            position = int(fields[1])
            reference = fields[3]
            alternate = fields[4]
            info = fields[7]
            # Append the extracted information to the vcf_data list
            vcf_data.append((chromosome, position, reference, alternate, info))
    return vcf_data

if __name__ == "__main__":
    # Demo: parse the sample file and show a tab-separated preview.
    vcf_file = 'sample.vcf'
    vcf_data = parse_vcf(vcf_file)
    # Column titles first, then at most the first ten parsed records.
    print("Chromosome\tPosition\tReference\tAlternate\tInfo")
    preview = vcf_data[:10]
    for record in preview:
        print(*record, sep="\t")

 

반응형