diff --git a/vcf/src/headers.rs b/vcf/src/headers.rs index 4e8f292..b5da98f 100644 --- a/vcf/src/headers.rs +++ b/vcf/src/headers.rs @@ -1,15 +1,15 @@ use std::collections::HashMap; #[derive(Debug, Clone, PartialEq, Eq)] -pub struct Header<'src> { - pub key: &'src str, - pub value: HeaderValue<'src>, +pub struct Header { + pub key: String, + pub value: HeaderValue, } #[derive(Debug, Clone, PartialEq, Eq)] -pub enum HeaderValue<'src> { - Flat(&'src str), - Nested(HashMap<&'src str, &'src str>), +pub enum HeaderValue { + Flat(String), + Nested(HashMap), } #[cfg(test)] @@ -34,15 +34,15 @@ mod tests { headers, vec![ Header { - key: "fileformat", - value: HeaderValue::Flat("VCFv1.4"), + key: "fileformat".to_string(), + value: HeaderValue::Flat("VCFv1.4".to_string()), }, Header { - key: "INFO", + key: "INFO".to_string(), value: HeaderValue::Nested(HashMap::from([ - ("abc", "123"), - ("xyz", "3125"), - ("sfh", "574"), + ("abc".to_string(), "123".to_string()), + ("xyz".to_string(), "3125".to_string()), + ("sfh".to_string(), "574".to_string()), ])), }, ], @@ -58,11 +58,11 @@ mod tests { header, Ok( Header { - key: "FORMAT", + key: "FORMAT".to_string(), value: HeaderValue::Nested(HashMap::from([ - ("abc", "123"), - ("xyz", "3125"), - ("sfh", "1,574"), + ("abc".to_string(), "123".to_string()), + ("xyz".to_string(), "3125".to_string()), + ("sfh".to_string(), "1,574".to_string()), ])), } ) @@ -78,11 +78,11 @@ mod tests { header, Ok( Header { - key: "FORMAT", + key: "FORMAT".to_string(), value: HeaderValue::Nested(HashMap::from([ - ("abc", "1,233"), - ("xyz", "3125"), - ("sfh", "157"), + ("abc".to_string(), "1,233".to_string()), + ("xyz".to_string(), "3125".to_string()), + ("sfh".to_string(), "157".to_string()), ])), } ) diff --git a/vcf/src/parse.rs b/vcf/src/parse.rs index dfa33dd..e79117a 100644 --- a/vcf/src/parse.rs +++ b/vcf/src/parse.rs @@ -12,21 +12,30 @@ lazy_static! { static ref HEADER_VALUE_REGEX: Regex = Regex::new(r#"(?:[^,"]+|(?:"[^"]*"))+"#).unwrap(); } -impl<'src> Header<'src> { - pub fn parse(input: &'src str) -> Result { +pub fn convert_to_string(hm: HashMap<&str, &str>) -> HashMap { + hm + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect::>() +} + + +impl Header { + pub fn parse(input: &str) -> Result { + println!("Parsing header input: {}", input); let line = input.trim(); let (key, value) = line.strip_prefix("##") .and_then(|line| line.split_once('=')) .ok_or(ParseError)?; let value = HeaderValue::parse(value)?; - Ok(Self { key, value }) + Ok(Self { key: key.to_string(), value: value }) } } -impl<'src> HeaderValue<'src> { - pub fn parse(input: &'src str) -> Result { +impl HeaderValue { + pub fn parse(input: &str) -> Result { match input.strip_prefix('<').and_then(|input| input.strip_suffix('>')) { - None => Ok(Self::Flat(input)), + None => Ok(Self::Flat(input.to_string())), Some(pairs) => { HEADER_VALUE_REGEX.captures_iter(pairs) .map(|c| c.get(0).unwrap().as_str()) @@ -38,6 +47,7 @@ impl<'src> HeaderValue<'src> { } ) .collect::, _>>() + .map(convert_to_string) .map(HeaderValue::Nested) } } diff --git a/vcf/src/validate_fileformat.rs b/vcf/src/validate_fileformat.rs index 843a356..714065f 100644 --- a/vcf/src/validate_fileformat.rs +++ b/vcf/src/validate_fileformat.rs @@ -25,19 +25,22 @@ mod tests { #[test] fn is_valid_if_key_is_fileformat() { - let header = Header {key: "fileformat", value: Flat("VCFv4.4")}; + let header = Header {key: "fileformat".to_string(), value: Flat("VCFv4.4".to_string())}; assert!(is_valid_file_format(&header)); } #[test] fn is_invalid_if_key_is_not_fileformat() { - let header = Header {key: "gileformat", value: Flat("VCFv4.4")}; + let header = Header {key: "gileformat".to_string(), value: Flat("VCFv4.4".to_string())}; assert!(!is_valid_file_format(&header)); } #[test] fn is_invalid_if_header_value_nested() { - let header = Header {key: "fileformat", value: Nested(HashMap::from([("another_key", "VCFv4.4")])) }; + let header = Header { + key: "fileformat".to_string(), + value: Nested(HashMap::from([("another_key".to_string(), "VCFv4.4".to_string())])) + }; assert!(!is_valid_file_format(&header)); } } diff --git a/vcf/src/vcf.rs b/vcf/src/vcf.rs index dbea8d7..0338976 100644 --- a/vcf/src/vcf.rs +++ b/vcf/src/vcf.rs @@ -7,6 +7,7 @@ use crate::parse; pub struct VCF { pub file_format: String, + pub format: Vec
, } #[derive(Debug)] @@ -61,7 +62,10 @@ impl From for VCFError { /// 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 /// "#; ///# use vcf::vcf::VCFError; -/// let vcf = parse_vcf(&vcf_source[..])?; +/// let vcf = match parse_vcf(&vcf_source[..]) { +/// Ok(vcf) => vcf, +/// Err(_) => panic!("Error when we should be ok") +/// }; /// assert_eq!(vcf.file_format, "VCFv4.4"); ///# Ok::<(), VCFError>(()) /// ``` @@ -101,15 +105,93 @@ impl From for VCFError { /// _ => assert!(false), /// }; /// ``` +/// +/// Similarly, we can obtain the format information for a file via the `format` attribute. +/// +/// ``` +/// use std::collections::HashMap; +/// use vcf::vcf::parse_vcf; +/// use vcf::{Header, HeaderValue}; +/// let vcf_source = br#"##fileformat=VCFv4.4 +/// ###fileDate=20090805 +/// ###source=myImputationProgramV3.1 +/// ###reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +/// ###contig= +/// ###phasing=partial +/// ###INFO= +/// ###INFO= +/// ###INFO= +/// ###INFO= +/// ###INFO= +/// ###INFO= +/// ###FILTER= +/// ###FILTER= +/// ###FORMAT= +/// ###FORMAT= +/// ###FORMAT= +/// ###FORMAT= +/// ##CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +/// 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +/// 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +/// 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +/// 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +/// 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 +/// "#; +///# use vcf::vcf::VCFError; +/// let vcf = parse_vcf(&vcf_source[..])?; +/// let hq = vcf.format +/// .iter() +/// .find( +/// |item| match &item.value { +/// HeaderValue::Nested(d) => match d.get("ID") {Some(v) => v == "HQ", _ => false}, +/// _ => false +/// } +/// ).unwrap(); +/// assert_eq!( +/// *hq, +/// Header { +/// key: "FORMAT".to_string(), +/// value: HeaderValue::Nested(HashMap::from([ +/// ("ID".to_string(), "HQ".to_string()), +/// ("Number".to_string(), "2".to_string()), +/// ("Type".to_string(), "Integer".to_string()), +/// ("Description".to_string(), "Haplotype Quality".to_string()), +/// ])) +/// } +/// ); +///# Ok::<(), VCFError>(()) +/// ``` pub fn parse_vcf(source: impl BufRead) -> Result { - let first_line = source.lines().next().ok_or(VCFError::ParseError)??; + let mut lines = source.lines(); + let first_line = lines.next().ok_or(VCFError::ParseError)??; let parsed = Header::parse(&first_line)?; - if is_valid_file_format(&parsed) { - match parsed.value { - Flat(s) => Ok(VCF {file_format: s.to_string()}), - _ => panic!(), - } - } else { - Err(VCFError::ParseError) + if !is_valid_file_format(&parsed) { + return Err(VCFError::ParseError) } + let file_format = match parsed.value { + Flat(s) => s.to_string(), + _ => panic!(), + }; + let formats = lines + .take_while(|s| match s { Ok(s) => s.starts_with("##"), _ => true}) + .map( + |result| match result { + Ok(ref line) => Header::parse(line).map_err(VCFError::from), + Err(e) => Err(VCFError::IoError(e)), + } + ) + .filter( + |result| match result { + Ok(header) if header.key == "FORMAT" => true, + Err(_) => true, + _ => false, + } + ) + .collect::, _>>()?; + Ok( + VCF { + file_format: file_format.to_string(), + format: formats, + } + ) }