<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Manuel Corpas&#039; Blog &#187; Tutorials</title>
	<atom:link href="http://manuelcorpas.com/category/tutorials/feed/" rel="self" type="application/rss+xml" />
	<link>http://manuelcorpas.com</link>
	<description>Genomes, Web 2.0 and Bioethics</description>
	<lastBuildDate>Wed, 23 May 2012 15:51:10 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='manuelcorpas.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://1.gravatar.com/blavatar/30b9f4f8115fc52af0bb4d7d67d33f7d?s=96&#038;d=http%3A%2F%2Fs2.wp.com%2Fi%2Fbuttonw-com.png</url>
		<title>Manuel Corpas&#039; Blog &#187; Tutorials</title>
		<link>http://manuelcorpas.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://manuelcorpas.com/osd.xml" title="Manuel Corpas&#039; Blog" />
	<atom:link rel='hub' href='http://manuelcorpas.com/?pushpress=hub'/>
		<item>
		<title>Converting FASTQ to FASTA</title>
		<link>http://manuelcorpas.com/2012/05/21/converting-fastq-to-fasta/</link>
		<comments>http://manuelcorpas.com/2012/05/21/converting-fastq-to-fasta/#comments</comments>
		<pubDate>Mon, 21 May 2012 15:17:57 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[Genomics]]></category>
		<category><![CDATA[code]]></category>
		<category><![CDATA[fasta]]></category>
		<category><![CDATA[fastq]]></category>
		<category><![CDATA[perl]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=1559</guid>
		<description><![CDATA[A little Perl one liner I borrowed from The Edwards Lab that converts FASTQ to FASTA. Please note I had to truncate the line to make it show properly in this blog entry. $ cat file_to_covert.fq &#124; perl -e \ '$i=0;while(&#60;&#62;){if(/^\@/&#38;&#38;$i==0){s/^\@/\&#62;/;print;}elsif($i==1){print;$i=-3}$i++;}' \ &#62; output.fasta Thanks Edwards Lab!<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1559&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>A little Perl one liner I borrowed from <a href="http://edwards.sdsu.edu/labsite/index.php/robert/289-how-to-convert-fastq-to-fasta" target="_blank">The Edwards Lab</a> that converts FASTQ to FASTA. Please note I had to truncate the line to make it show properly in this blog entry.</p>
<pre>$ cat file_to_covert.fq | perl -e \
'$i=0;while(&lt;&gt;){if(/^\@/&amp;&amp;$i==0){s/^\@/\&gt;/;print;}elsif($i==1){print;$i=-3}$i++;}' \
&gt; output.fasta</pre>
<p>Thanks Edwards Lab!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/1559/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/1559/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/1559/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/1559/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/1559/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/1559/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/1559/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/1559/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/1559/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/1559/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/1559/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/1559/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/1559/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/1559/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1559&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2012/05/21/converting-fastq-to-fasta/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
		<item>
		<title>Nightmare Naming Conventions</title>
		<link>http://manuelcorpas.com/2012/03/22/nightmare-naming-conventions/</link>
		<comments>http://manuelcorpas.com/2012/03/22/nightmare-naming-conventions/#comments</comments>
		<pubDate>Thu, 22 Mar 2012 13:28:24 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Lectures]]></category>
		<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[computer science]]></category>
		<category><![CDATA[Data Analysis]]></category>
		<category><![CDATA[directory]]></category>
		<category><![CDATA[file]]></category>
		<category><![CDATA[next generation sequencing]]></category>
		<category><![CDATA[pipeline]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=1486</guid>
		<description><![CDATA[One of the tasks I seem to be spending a lot time thinking about these days is how to name files and structure them in the appropriate directories so that they follow a consistent logic. This is because my current research involves development of analysis pipelines of Next Generation Sequencing Data where the output file(s) [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1486&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>One of the tasks I seem to be spending a lot of time thinking about these days is how to name files and structure them in the appropriate directories so that they follow a consistent logic. This is because my current research involves development of analysis pipelines of Next Generation Sequencing Data where the output file(s) of a program(s) is the input to the next. These processing steps allow raw data straight out of the machine to help answer the biological questions for which the experiments were run in the first place.</p>
<p>File and directory naming conventions may sound like a trivial thing to do but I have found that their complexity increases exponentially when many components are run. To illustrate my current approach to tackling this problem, I present here a simple example. Suppose a project (&#8216;project_name&#8217;) that runs two programs, &#8216;program_1&#8217; and &#8216;program_2&#8217;. Each time the pipeline is run, input files may vary and so I create a new &#8216;job_name&#8217; for each run. I have come up with this directory architecture:</p>
<pre>/project_name
/project_name/data
/project_name/data/job_name_1
/project_name/data/job_name_1/input_data_type_1
/project_name/data/job_name_1/input_data_type_2
/project_name/data/job_name_1/input_data_type_3
/project_name/results
/project_name/results/job_name_1/program_1
/project_name/results/job_name_1/program_1/output_1
/project_name/results/job_name_1/program_1/output_2
...
/project_name/results/job_name/program_2/output_1
/project_name/results/job_name/program_2/output_2
...</pre>
<p>What would happen if instead of running 2 programs as I did above I run 5 or 6? And what if for each input data file I had replicates? What about maximising the number of steps taken in parallel? You can start to see that the thing really gets complicated.</p>
<p>File and directory naming conventions are something that I am teaching myself, but any directives or systematic methods taught during my computer science student years would have come in handy now. In future bioinformatics lectures I teach I will definitely challenge my students to think about this issue very carefully.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/1486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/1486/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/1486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/1486/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/1486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/1486/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/1486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/1486/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/1486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/1486/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/1486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/1486/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/1486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/1486/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1486&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2012/03/22/nightmare-naming-conventions/feed/</wfw:commentRss>
		<slash:comments>9</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
		<item>
		<title>Converting Genes and Genomic Features From NCBI36 to GRCh37</title>
		<link>http://manuelcorpas.com/2012/01/10/converting-genes-and-genomic-features-between-ncbi36-to-grch37/</link>
		<comments>http://manuelcorpas.com/2012/01/10/converting-genes-and-genomic-features-between-ncbi36-to-grch37/#comments</comments>
		<pubDate>Tue, 10 Jan 2012 15:00:12 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Bioinformatics]]></category>
		<category><![CDATA[Genomics]]></category>
		<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[gene]]></category>
		<category><![CDATA[GRCh37]]></category>
		<category><![CDATA[ncbi36]]></category>
		<category><![CDATA[remap]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=1273</guid>
		<description><![CDATA[The Human Genome is a like map where features and genes are mapped to. As techniques improve, our fine-grained resolution for that map increases and new versions are released every few years. When a new coordinate reference map (or assembly) for the Human Genome is released, it produces lots of headaches for those who work [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1273&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>The Human Genome is like a map to which features and genes are mapped. As techniques improve, our fine-grained resolution for that map increases and new versions are released every few years. When a new coordinate reference map (or assembly) for the Human Genome is released, it produces lots of headaches for those who work in the field as it means that the locations of genes, chromosomal bands and other features like Single Nucleotide Polymorphisms (SNPs) or Copy Number Variation (CNVs) change.</p>
<p>In order to have the most up-to-date version for the Human Genome set of genes and features sometimes it is necessary to convert from one assembly to another. In the past I have written a tutorial on how to <a href="http://manuelcorpas.com/2011/02/02/838/" target="_blank">remap from NCBI36 to GRCh37 human assemblies using liftOver</a>. In this tutorial I present a simple step-by-step guide for feature remapping using NCBI&#8217;s remapping tool.</p>
<p><strong>Important:</strong></p>
<p>Please make sure you know in advance the assembly to which your aberration data is currently mapped to. If by mistake you remap an aberration already in GRCh37 to GRCh37 you will get new coordinates for the region mapped to the wrong coordinates.</p>
<p>The NCBI provides a web facility to convert coordinates from one assembly into another. To convert coordinates using their genome remapping service do the following:</p>
<ol>
<li>Make sure that your data is in BED format,  e.g. “chr3            100000 999990 myId0000123” -&gt; CNV aberration in NCBI36/hg18</li>
</ol>
<ul>
<li>Please note that each field is separated by a tab and each line by a character return. Please follow this strictly or the remapping tool may throw an error.</li>
<li>Add as many lines as aberrations you would like to remap</li>
</ul>
<ol>
<li>Go to the NCBI Remap page:</li>
</ol>
<ul>
<li><a href="http://www.ncbi.nlm.nih.gov/genome/tools/remap/" rel="nofollow">http://www.ncbi.nlm.nih.gov/genome/tools/remap/</a></li>
</ul>
<ol>
<li>Select “Organism for source data” Homo Sapiens, “Source Assembly” NCBI36 (hg18) and “Target Assembly” GRCh37 (hg19)</li>
<li>Please leave all “Remapping Options” (Minimum ratio of bases that must remap, etc) with default values</li>
<li>Select for “Input format” BED, “Output format” Same as input</li>
<li>Paste your aberration in the input box where it says “Paste data here” and hit submit at the bottom of the page</li>
<li>Wait until results are returned</li>
<li>To retrieve results download “Mapping Report”, which is in excel format or alternatively Mapping report Sample in the results page</li>
</ol>
<p><a href="http://corpasfoo.files.wordpress.com/2012/01/screen-shot-2012-01-10-at-11-02-50-am.png"><img class="aligncenter size-full wp-image-1274" title="result remapping tool" src="http://corpasfoo.files.wordpress.com/2012/01/screen-shot-2012-01-10-at-11-02-50-am.png?w=480" alt=""   /></a></p>
<p>Please note that your aberration may remap to more than one location. I recommend that you manually check the coordinates and select the most appropriate of the doubly remapped aberration in the new assembly. Please also note that your aberration may not remap because the region is partially or entirely deleted in the new assembly or split in GRCh37. In this case I recommend that you use another start or end point position, maybe use the start/end of alternative probes until you find a region where it maps.</p>
<p>Another possibility could be to look at the genes for the region in the old assembly and select a region in GRCh37 that includes the same genes as in NCBI36. Each of these solutions requires careful deliberation and may not be applicable to your particular case (e.g. genes in different chromosomes would not allow remapping based on genes).</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/1273/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/1273/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/1273/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/1273/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/1273/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/1273/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/1273/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/1273/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/1273/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/1273/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/1273/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/1273/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/1273/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/1273/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1273&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2012/01/10/converting-genes-and-genomic-features-between-ncbi36-to-grch37/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>

		<media:content url="http://corpasfoo.files.wordpress.com/2012/01/screen-shot-2012-01-10-at-11-02-50-am.png" medium="image">
			<media:title type="html">result remapping tool</media:title>
		</media:content>
	</item>
		<item>
		<title>Beware of Gene Names in Excel</title>
		<link>http://manuelcorpas.com/2011/11/05/beware-of-gene-names-in-excel/</link>
		<comments>http://manuelcorpas.com/2011/11/05/beware-of-gene-names-in-excel/#comments</comments>
		<pubDate>Sat, 05 Nov 2011 21:00:27 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Bioinformatics]]></category>
		<category><![CDATA[Genomics]]></category>
		<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[Excel]]></category>
		<category><![CDATA[HGNC]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=1201</guid>
		<description><![CDATA[For the past few days I have been trying to compile the list of gene names that is the most complete possible. To start with, I was given an initial list of genes in an excel file that was taken from the HUGO Gene Nomenclature Committee (HGNC). Unfortunately, the gene names were pasted from the [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1201&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>For the past few days I have been trying to compile the list of gene names that is the most complete possible. To start with, I was given an initial list of genes in an excel file that was taken from the <a href="http://www.genenames.org/" target="_blank">HUGO Gene Nomenclature Committee</a> (HGNC). Unfortunately, the gene names were pasted from the original source (HGNC) to an Excel spreadsheet without modifying the expected format of the column cells. This led to Excel trying to &#8220;help&#8221; with the formatting of the value inserted, changing those gene names that are similar to dates to an actual date. In the bioinformatics field, misnaming a gene can lead to disastrous consequences such as misdiagnosis of a causal gene in a clinical setting. Thus:</p>
<h4>Beware of pasting gene names in an Excel spreadsheet with a default format, as these may be changed into dates.</h4>
<p>From my current list of 19,026 genes that I have compiled as of now, here are the names of the genes that have been automatically changed by Excel into dates. In the table below, the first column denotes the date the gene name is changed to, the middle column the <a href="http://www.ensembl.org" target="_blank">Ensembl</a> ID of the gene and the right column the actual name that was changed by Excel into a date.</p>
<pre>Sep-01    ENSG00000180096        SEPT1    
Sep-02    ENSG00000168385        SEPT2
Sep-03    ENSG00000100167        SEPT3
Sep-04    ENSG00000108387        SEPT4
Sep-05    ENSG00000184702        SEPT5
Sep-06    ENSG00000125354        SEPT6
Sep-07    ENSG00000122545        SEPT7
Sep-08    ENSG00000164402        SEPT8
Sep-09    ENSG00000184640        SEPT9
Sep-10    ENSG00000186522        SEPT10
Sep-11    ENSG00000138758        SEPT11
Sep-12    ENSG00000140623        SEPT12
Sep-14    ENSG00000154997        SEPT14

Mar-01    ENSG00000145416        MARCH1
Mar-02    ENSG00000099785        MARCH2
Mar-03    ENSG00000173926        MARCH3
Mar-04    ENSG00000144583        MARCH4
Mar-05    ENSG00000198060        MARCH5
Mar-06    ENSG00000145495        MARCH6
Mar-07    ENSG00000136536        MARCH7
Mar-08    ENSG00000165406        MARCH8
Mar-09    ENSG00000139266        MARCH9
Mar-10    ENSG00000173838        MARCH10
Mar-11    ENSG00000183654        MARCH11

Dec-01    ENSG00000173077        DEC1</pre>
<p>&nbsp;</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/1201/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/1201/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/1201/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/1201/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/1201/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/1201/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/1201/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/1201/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/1201/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/1201/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/1201/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/1201/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/1201/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/1201/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=1201&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2011/11/05/beware-of-gene-names-in-excel/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
		<item>
		<title>Remapping from NCBI36/hg18 to GRCh37/hg19</title>
		<link>http://manuelcorpas.com/2011/02/02/838/</link>
		<comments>http://manuelcorpas.com/2011/02/02/838/#comments</comments>
		<pubDate>Wed, 02 Feb 2011 17:43:13 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Bioinformatics]]></category>
		<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[GRCh37]]></category>
		<category><![CDATA[remap]]></category>
		<category><![CDATA[ucsc]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=838</guid>
		<description><![CDATA[Given the huge response I have at work about remapping features into another assembly, I present here an adapted version for how to remap a feature from NCBI36/hg18 to GRCh37/hg19 using UCSC&#8217;s liftOver tool. Important: Please make sure you know in advance the assembly to which your aberration data is currently mapped to. If by [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=838&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Given the huge response I have at work about remapping features into another assembly, I present here an adapted version for how to remap a feature from NCBI36/hg18 to GRCh37/hg19 using UCSC&#8217;s liftOver tool.</p>
<h3>Important:</h3>
<p>Please make sure you know in advance the assembly to which your aberration data is currently mapped to. If by mistake you remap an aberration already in GRCh37 to GRCh37 you will get new coordinates for the region mapped to the wrong coordinates.<br />
UCSC’s Genome Browser provides a web facility to convert coordinates from one assembly into another. To convert coordinates using their liftOver tool do the following:</p>
<ol>
<li>Make sure that your data is in BED format, e.g.  “chr3     100000  999990  myPatientId0000123” &#8211;&gt; aberration in NCBI36/hg18</li>
<li>Note that each field is separated by a tab and each line by a character return. Please follow this strictly or the remapping tool may throw an error.</li>
<li>Add as many lines as aberrations you would like to remap.</li>
<li>Go to the <a href="http://genome.ucsc.edu/cgi-bin/hgLiftOver">liftOver page</a></li>
<li>Select “Original Assembly” Mar. 2006 (NCBI36/hg18) and “New Assembly” Feb. 2009 (GRCh37/hg19)</li>
<li>Leave all other parameters (Minimum ratio of bases that must remap, etc) with default values</li>
<li>Paste your aberration in the input box where it says “Paste in data” and hit submit</li>
<li>To get results, scroll down the page and click on the “View Conversions” link.</li>
<li>Here is the result I get:</li>
</ol>
<pre>chr3  125000      1024990     myPatientId0000123</pre>
<p>Please note that your feature may not remap because the region is partially or entirely deleted in the new assembly or split in GRCh37. In this case I recommend that you use another start or end point position, maybe use the start/end of alternative probes until you find a region where it maps. Another possibility would be to look at the genes for the region in the old assembly and select a region in GRCh37 that includes the same genes as in NCBI36. Each of these solutions requires careful deliberation and may not be applicable to your particular case (e.g. genes in different chromosomes would not allow remapping based on genes).</p>
<p>I hope this is helpful.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/838/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/838/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/838/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/838/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/838/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/838/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/838/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/838/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/838/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/838/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/838/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/838/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/838/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/838/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=838&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2011/02/02/838/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
		<item>
		<title>Latest Sanger&#8217;s Public Engagement Video</title>
		<link>http://manuelcorpas.com/2011/01/26/latest-sangers-public-engagement-video/</link>
		<comments>http://manuelcorpas.com/2011/01/26/latest-sangers-public-engagement-video/#comments</comments>
		<pubDate>Wed, 26 Jan 2011 16:21:05 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Bioinformatics]]></category>
		<category><![CDATA[Biology]]></category>
		<category><![CDATA[Lectures]]></category>
		<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[biomedicine]]></category>
		<category><![CDATA[public engagement]]></category>
		<category><![CDATA[science]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=824</guid>
		<description><![CDATA[Here is a very entertaining video that shows in images some of the fascinating science happening within the Wellcome Trust Sanger Institute&#8216;s walls. In the story a group of high school students come to visit and learn science via its Public Engagement Programme. I consider myself very fortunate of being a contributor to these efforts, [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=824&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Here is a very entertaining video that shows in images some of the fascinating science happening within the <a href="http://www.sanger.ac.uk">Wellcome Trust Sanger Institute</a>&#8216;s walls. In the story a group of high school students come to visit and learn science via its <a href="http://www.sanger.ac.uk/about/engagement/">Public Engagement Programme</a>. </p>
<p>I consider myself very fortunate to be a contributor to these efforts, whose main objective is to inspire the younger generations and bring them closer to the biomedical sciences. There is no better way of maintaining one&#8217;s own motivation to do science than to inspire others, especially the <em>would-be</em> scientists.</p>
<p>The following quote describing the role of the Public Engagement Program at Sanger is taken from its own website. I think it explains very well their mission and importance as a way of raising awareness and securing future funding for this kind of research.</p>
<blockquote><p>The role of the Wellcome Trust Sanger Institute&#8217;s Communication and  Public Engagement programme is to promote understanding of the nature, discoveries and wonder of science  and its implications for individuals and society.</p></blockquote>
<p>&nbsp;<br />
<span style="text-align:center; display: block;"><a href="http://manuelcorpas.com/2011/01/26/latest-sangers-public-engagement-video/"><img src="http://img.youtube.com/vi/tYlm_gipZc0/2.jpg" alt="" /></a></span></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/824/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/824/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/824/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/824/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/824/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/824/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/824/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/824/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/824/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/824/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/824/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/824/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/824/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/824/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=824&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2011/01/26/latest-sangers-public-engagement-video/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
		<item>
		<title>Sending Sensitive Data Encrypted</title>
		<link>http://manuelcorpas.com/2010/07/08/sending-sensitive-data-encrypted/</link>
		<comments>http://manuelcorpas.com/2010/07/08/sending-sensitive-data-encrypted/#comments</comments>
		<pubDate>Thu, 08 Jul 2010 13:36:47 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Technology]]></category>
		<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[encryption]]></category>
		<category><![CDATA[passphrase]]></category>
		<category><![CDATA[privacy]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=616</guid>
		<description><![CDATA[The other day I was asked to find a way to send sensitive clinical data to another institute. How to make sure that the data is protected and only accessible to the right people? There are two aspects of protecting data, reflecting the different risks which the data may be exposed to: data in transit [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=616&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>The other day I was asked to find a way to send sensitive clinical data to another institute. How to make sure that the data is protected and only accessible to the right people? There are two aspects of protecting data, reflecting the different  risks which the data may be exposed to:</p>
<ul>
<li>data in transit (email &#8220;in flight&#8221;, web or FTP downloads, data sets  on USB disks shipped by FedEx, etc)</li>
<li>data at rest (email arrived in recipient&#8217;s inbox, data copied to  collaborator&#8217;s working disk, etc)</li>
</ul>
<p>Here we will only explore the requirements for encrypting data in  transit. The security of the data at rest is assumed to be taken care of  by the collaborator or their IT staff, since it is outside one&#8217;s control.</p>
<p>There are various possible file transfer methods:</p>
<ul>
<li>email &#8211; suitable for small files (typically up to 5MB although  different sites impose different limits); no automatic encryption in  transit</li>
<li>FTP or non-SSL password-protected web site &#8211; suitable for large  files (in the GB range); no automatic encryption in transit</li>
<li>scp &#8211; suitable for large files; intrinsic encryption in transit;  likely to encounter firewall issues</li>
<li>password-protected SSL web site &#8211; suitable for large files;  intrinsic encryption in transit</li>
<li>USB disk &#8211; suitable for very large data sets (TB range); no  automatic encryption in transit</li>
</ul>
<p>When encryption is mandated (e.g. by a data access agreement) and the  file transfer method does not provide encryption intrinsically, it is  necessary to encrypt the data separately and transfer the encrypted file  by the chosen method.</p>
<p style="text-align:center;"><a href="http://corpasfoo.files.wordpress.com/2010/07/security.jpg"><img class="size-medium wp-image-619 aligncenter" title="security" src="http://corpasfoo.files.wordpress.com/2010/07/security.jpg?w=300&#038;h=200" alt="" width="300" height="200" /></a></p>
<p>For ad-hoc or one-off data encryption, it is appropriate to encrypt a  data set with  a password (&#8220;symmetric encryption&#8221;, because the same  password is used to encrypt and decrypt) which will be sent to the  recipient <strong>by a separate means</strong> to the actual data. For example, if  the data is shipped on a USB disk,  the password could be sent by  email, or given over the phone. Sending  the password with the encrypted  data defeats the object of encrypting  it!</p>
<p>For regular or scheduled data transfers, public-key encryption may be  suitable &#8211; and removes the need to send a password &#8211; but that will not  be explored here due to the extra work in creating and managing keys.</p>
<p>A suitable encryption tool on Linux systems is gpg (the <a href="http://www.gnupg.org/">GNU Privacy  Guard</a>). The simplest usage is to prepare a single file containing the  data in question using tar or zip, and then to encrypt that:</p>
<pre>$ gpg -c bigfile.tar
gpg: gpg-agent is not available in this session
Enter passphrase:
Repeat passphrase:</pre>

<pre>$ ls bigfile.tar*
bigfile.tar    bigfile.tar.gpg</pre>
<p>At this point, "bigfile.tar.gpg" is the encrypted file which is safe  to transfer by email, FTP, or any other non-encrypted method. Note that  the passphrase is not displayed while it is being entered; and that the  encrypted file is typically smaller than the original due to compression  in the encryption process. However it is necessary to have enough disk  space to contain both the original and the encrypted data  simultaneously, which may make this approach unsuitable for very large  (TB) datasets.</p>
<p>The passphrase should be chosen with the same care as a computer  login password. The Linux utility "pwgen" produces a selection of random  passwords which may be useful in selecting a suitable passphrase.</p>
<p>The recipient will decrypt the file in a similar way:</p>
<pre>$ gpg bigfile.tar.gpg
gpg: CAST5 encrypted data
gpg: gpg-agent is not available in this session
Enter passphrase:
gpg: encrypted with 1 passphrase
gpg: WARNING: message was not integrity protected</pre>
<p>Note that if the passphrase is lost then it is vanishingly unlikely that  the encrypted data can be recovered. Unless the passphrase is easily  guessable, the encryption is sufficiently strong as to defeat most  attempts to break it.</p>
<p><em>Written by Dr David Holland (<a href="http://www.sanger.ac.uk/">WTSI</a>), adapted by Manuel Corpas. Posted with Dr Holland's permission.<br />
</em></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/616/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/616/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/616/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/616/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/616/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/616/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/616/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/616/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/616/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/616/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/616/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/616/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/616/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/616/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=616&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2010/07/08/sending-sensitive-data-encrypted/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>

		<media:content url="http://corpasfoo.files.wordpress.com/2010/07/security.jpg?w=300" medium="image">
			<media:title type="html">security</media:title>
		</media:content>
	</item>
		<item>
		<title>Biomedical Community-Wide Annotation Using Wikipedia</title>
		<link>http://manuelcorpas.com/2010/06/03/biomedical-community-wide-annotation-using-wikipedia/</link>
		<comments>http://manuelcorpas.com/2010/06/03/biomedical-community-wide-annotation-using-wikipedia/#comments</comments>
		<pubDate>Thu, 03 Jun 2010 20:33:25 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Bioinformatics]]></category>
		<category><![CDATA[Biotech]]></category>
		<category><![CDATA[Databases]]></category>
		<category><![CDATA[Technology]]></category>
		<category><![CDATA[Tutorials]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=582</guid>
		<description><![CDATA[The pace of data generation is leaving far behind our ability to convert this data into usable knowledge. Even well funded biomedical databases find it increasingly difficult to keep up to speed. In order to tackle this problem, some databases have opted for increasing automation in the way data is deposited, reducing the time needed [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=582&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>The pace of data generation is leaving far behind our ability to convert this data into usable knowledge. Even well funded biomedical databases find it increasingly difficult to keep up to speed. In order to tackle this problem, some databases have opted for increasing automation in the way data is deposited, reducing the time needed for interpreting results. The problem with this approach is that generated knowledge as a result is less accurate than manually annotated entries and of lower quality. Another potential solution has been to engage leading experts, creating a sort of consortium where they give some of their time to curate data entries that match their specialties. Unfortunately, engaging world experts in curating biomedical resources has not had a lot of success, with a few contributing a lot and many hardly ever dedicating any time to curation no matter how much they were fetched.</p>
<p>A new revolutionary idea has come from <a href="http://www.sanger.ac.uk/research/faculty/abateman/">Alex Bateman</a>&#8217;s group to engage not just the community of experts but the whole of the Internet, using Wikipedia. One of his group&#8217;s databases, <a href="http://rfam.sanger.ac.uk/">Rfam</a>, which characterises RNA families, is now providing all of its annotation via Wikipedia. Wikipedia is already the leading reference resource for all kinds of information. It possesses the know-how and capability to mediate the curation of database entries as well as managing to have extremely resounding success in terms of gathering reasonably high quality knowledge.</p>
<p>After having a persuasive discussion with Alex, I decided to give it a try myself and add my very first entry to Wikipedia, which I thought could potentially help the database I develop outsource its <em>public/non-sensitive</em> data annotation part.</p>
<p>I copied, edited and formatted parts of a non-sensitive entry (a Syndrome description) to Wikipedia. I learnt – contrary to what I expected – that as long as one has an account and no entry exists on the topic, a page can be added on the fly. So I added a page and started editing, copying and pasting.</p>
<p>It took me a bit of time to get used to some of the conventions and formatting tags used by Wikipedia but very early on I had help from Wikipedia &#8216;agents&#8217;. It really surprised me how quickly these agents picked up my entry and immediately made me know the criteria for making sure this Wikipedia entry achieves a high standard.</p>
<p>I learnt about important concepts in the Wikipedia context such as <a href="http://simple.wikipedia.org/wiki/Wikipedia:Notability">Notability</a> and <a href="http://en.wikipedia.org/wiki/Conflict_of_interest">Conflicts of Interest</a>. Apparently one cannot write about oneself for example, and personal opinions or articles are not accepted. So far this was OK for me although problems came when one of these agents pointed out some copyright issues: I was trying to copy an entry from a website/database.</p>
<p>Blatant copying of public content from another website is considered a copyright violation unless a correct license is put in place and one &#8216;owns&#8217; the data. In our case, the <a href="http://en.wikipedia.org/wiki/Creative_Commons_licenses">Creative Commons License</a>, which is the one we hold, was not OK because although it permits public use of the information, it does not allow alteration. This means that people would not be able to edit my Wikipedia entry.</p>
<p>I must admit I felt intimidated at this point. Despite that, I was extremely impressed with the efficacy with which agents acted as well as how quickly they responded to my queries. I can understand why they have to be so tough so that they prevent abuse.</p>
<p>Overall I feel quite satisfied with what I have learnt in the process and I am extremely eager to keep exploring the use of Wikipedia for database curation. Of course this is just a try and our adopted solution for keeping up with current annotation may be something different in the end. However, it is worth a try.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/582/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/582/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/582/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/582/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/582/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/582/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/582/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/582/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/582/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/582/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/582/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/582/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/582/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/582/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=582&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2010/06/03/biomedical-community-wide-annotation-using-wikipedia/feed/</wfw:commentRss>
		<slash:comments>9</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
		<item>
		<title>A Script to Calculate GC content</title>
		<link>http://manuelcorpas.com/2010/02/03/a-script-to-calculate-gc-content/</link>
		<comments>http://manuelcorpas.com/2010/02/03/a-script-to-calculate-gc-content/#comments</comments>
		<pubDate>Wed, 03 Feb 2010 09:02:22 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Bioinformatics]]></category>
		<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[array]]></category>
		<category><![CDATA[GC content]]></category>
		<category><![CDATA[perl]]></category>
		<category><![CDATA[substr]]></category>

		<guid isPermaLink="false">http://manuelcorpas.com/?p=406</guid>
		<description><![CDATA[Intermediate Perl GC content is a very interesting property of DNA sequences because it is correlated to repeats and gene deserts. A simple way to calculate GC content is to divide the sum of G and C letters by the total number of nucleotides in the sequence. Let&#8217;s assume that you start with a string [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=406&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<h3>Intermediate Perl</h3>
<p>GC content is a very interesting property of DNA sequences because it is correlated to repeats and gene deserts. A simple way to calculate GC content is to divide the sum of G and C letters by the total number of nucleotides in the sequence. Let&#8217;s assume that you start with a string $sequence.</p>
<p>The <strong>WRONG</strong> way in which I initially did this was to convert the string to an array of letters, as shown here:</p>
<pre>#WRONG METHOD 1
sub calcgc {
 my $seq = $_[0];
 my @seqarray = split('',$seq);
 my $count = 0;
 foreach my $base (@seqarray) {
   $count++ if $base =~ /[G|C]/i;
 }
 my $len = $#seqarray+1;
 my $num=$count / $len;
 my ($dec)=$num =~ /(\S{6})/;
 return $dec;
}</pre>
<p>This is a very inefficient way of calculating the GC content, because arrays in Perl are quite expensive in terms of memory. The result of this was that I ran out of memory quite quickly.</p>
<p>I found a more efficient approach by using the <strong><em>substr</em></strong> function, looping through the whole sequence, taking one base at a time. However, according to a colleague, <a href="http://www.ebi.ac.uk/Information/Staff/person_maint.php?s_person_id=883">Andy Jenkinson</a>,  <strong>it contains some bugs</strong>:</p>
<pre>#WRONG METHOD 2
sub calcgc {
 my $seq = $_[0];
 my $count = 0;
 my $len   = length($seq);
 for (my $i = 1;$i&lt;$len+1; $i++) {
   <strong>my $base = substr $seq, $i, 1;</strong>
 $count++ if $base =~ /[G|C]/i;
 }
 my $num=$count / $len;
 my ($dec)=$num =~ /(\S{6})/;
 return $dec;
}</pre>
<p>The reasons for being wrong, Andy argues, are that &#8220;it ignores the first character of the sequence because the substr function is zero-index based. The rounding at the end using \S{6} also only works where there are &gt;=6 characters in the resulting fraction &#8211; so a string like &#8220;ATCG&#8221; has a GC content of 0.5, but will appear to your application as zero. If you need to do this, you should use \S{0,6}.&#8221;</p>
<p>In addition to this, he adds that whilst it solves the memory issue, [one] might also consider a much more CPU-friendly and simpler implementation:</p>
<pre>#METHOD 3
<strong>$count++ while $seq =~ /[CG]/gi;</strong></pre>
<p>He carried out a test simulation of #METHOD 3 for human chromosome 1 (247 million characters), which took 12 seconds with the same memory footprint as #METHOD 2, which took 111 seconds. Here is the source code for Andy&#8217;s simulation:</p>
<pre>use strict;

sub calcgc {
 my $seq = $_[0];
 my $count = 0;
 my $len   = length($seq);
 for (my $i = 1;$i&lt;$len+1; $i++) {
 my $base = substr $seq, $i, 1;
 $count++ if $base =~ /[G|C]/i;
 }
 my $num=$count / $len;
 my ($dec)=$num =~ /(\S{6})/;
 return $dec;
}

sub calcgc2 {
 my $seq = $_[0];
 my $count = 0;
 $count++ while ($seq =~ m/[GC]/gi);
 my $num = $count / length($seq);;
 my ($dec) = $num =~ /(\S{0,6})/;
 return $dec;
}

my $seq = "CBADEFGHIJ"x (100*1000*247);
my $time1 = time();
my $gc1 = calcgc($seq);
my $time2 = time();
print "Old method: '$gc1' in ".($time2-$time1)." seconds\n";
my $gc2 = calcgc2($seq);
my $time3 = time();
print "New method: '$gc2' in ".($time3-$time2)." seconds\n";</pre>
<p>I have not had time to test #METHOD 3 yet, but I hope this last addition helps people.</p>
<p>Happy coding!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/406/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/406/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/406/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/406/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/406/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/406/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/406/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/406/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/406/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/406/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/406/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/406/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/406/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/406/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=406&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2010/02/03/a-script-to-calculate-gc-content/feed/</wfw:commentRss>
		<slash:comments>12</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
		<item>
		<title>A Simple Script to Remove Duplicate Emails</title>
		<link>http://manuelcorpas.com/2009/12/30/a-simple-script-to-remove-duplicate-emails/</link>
		<comments>http://manuelcorpas.com/2009/12/30/a-simple-script-to-remove-duplicate-emails/#comments</comments>
		<pubDate>Wed, 30 Dec 2009 11:04:58 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[duplicate]]></category>
		<category><![CDATA[eliminate]]></category>
		<category><![CDATA[email]]></category>
		<category><![CDATA[list]]></category>
		<category><![CDATA[remover]]></category>

		<guid isPermaLink="false">http://corpasfoo.wordpress.com/?p=367</guid>
		<description><![CDATA[Basic Perl Programming Suppose you have a list of emails compiled from different sources and want to get rid of duplicates. You google this and to your amazement (and mine), there is no free service available to paste your emails to return a list of unique addresses. Here is a small simple script to eliminate [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=367&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<h4>Basic Perl Programming</h4>
<p>Suppose you have a list of emails compiled from different sources and want to get rid of duplicates. You google this and to your amazement (and mine), there is no free service available to paste your emails to return a list of unique addresses.</p>
<p>Here is a small simple script to eliminate email duplicates, in case this saves someone else a few minutes (or hours!).</p>
<p>This script is written in Perl and takes a list of email addresses (one per line), prunes it and returns a list with unique ones. You will need to have Perl installed on your computer (you can download it for free for Windows using <a href="http://www.activestate.com/activeperl/">ActivePerl</a>); any flavor of UNIX or Mac OS should have Perl installed by default.</p>
<p>Once you have Perl installed, copy the following script to a file and save it as <em>duplicate_email_remover.pl</em> in a suitable directory:</p>
<pre>#!perl
use strict;
my %hash;
while(my $line = &lt;&gt;) {
  chomp($line);
  $hash{$line}++;
}
map {print $_ ."\n"} keys %hash;</pre>
<p>Also make sure that you have your email list (one email per line) saved in a file named <em>emails.txt</em> in the same directory where you saved <em>duplicate_email_remover.pl</em>.</p>
<p>Run the following command in your console (or command prompt if you use Windows):</p>
<p><em>perl duplicate_email_remover.pl &lt; emails.txt</em></p>
<p>A list of unique email addresses should be printed to your console.</p>
<p>Enjoy!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/corpasfoo.wordpress.com/367/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/corpasfoo.wordpress.com/367/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/corpasfoo.wordpress.com/367/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/corpasfoo.wordpress.com/367/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/corpasfoo.wordpress.com/367/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/corpasfoo.wordpress.com/367/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/corpasfoo.wordpress.com/367/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/corpasfoo.wordpress.com/367/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/corpasfoo.wordpress.com/367/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/corpasfoo.wordpress.com/367/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/corpasfoo.wordpress.com/367/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/corpasfoo.wordpress.com/367/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/corpasfoo.wordpress.com/367/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/corpasfoo.wordpress.com/367/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=manuelcorpas.com&#038;blog=5424602&#038;post=367&#038;subd=corpasfoo&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://manuelcorpas.com/2009/12/30/a-simple-script-to-remove-duplicate-emails/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ad7b9d282ec00a53fee5c0d293f3f425?s=96&#38;d=wavatar" medium="image">
			<media:title type="html">manuelcorpas</media:title>
		</media:content>
	</item>
	</channel>
</rss>
