<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Things I tend to forget</title>
	<atom:link href="http://jeffreybreen.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://jeffreybreen.wordpress.com</link>
	<description>if I don&#039;t write it down, I have to google for it again</description>
	<lastBuildDate>Fri, 25 Jan 2013 22:22:10 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='jeffreybreen.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Things I tend to forget</title>
		<link>http://jeffreybreen.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://jeffreybreen.wordpress.com/osd.xml" title="Things I tend to forget" />
	<atom:link rel='hub' href='http://jeffreybreen.wordpress.com/?pushpress=hub'/>
		<item>
		<title>Slides and replay of my &#8220;Using R with Hadoop&#8221; webinar now available #rstats #hadoop</title>
		<link>http://jeffreybreen.wordpress.com/2013/01/25/slides-and-replay-of-my-using-r-with-hadoop-webinar-now-available-rstats-hadoop/</link>
		<comments>http://jeffreybreen.wordpress.com/2013/01/25/slides-and-replay-of-my-using-r-with-hadoop-webinar-now-available-rstats-hadoop/#comments</comments>
		<pubDate>Fri, 25 Jan 2013 22:22:10 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[R]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=455</guid>
		<description><![CDATA[I owe a big &#8220;thank you&#8221; to all of you who attended my webinar yesterday &#8220;Using R with Hadoop&#8221;. Revolution Analytics partnered with us at Think Big Analytics to produce the webinar, and I owe them thanks as well. For those of you who missed it, the slides and replay are now available from Revolution [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=455&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>I owe a big &#8220;thank you&#8221; to all of you who attended my webinar yesterday &#8220;Using R with Hadoop&#8221;. Revolution Analytics partnered with us at Think Big Analytics to produce the webinar, and I owe them thanks as well.</p>
<p>For those of you who missed it, the slides and replay are <a href="http://bit.ly/rhadoopw">now available from Revolution Analytics</a>.</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/16179279' width='780' height='639'></iframe>
<p>&nbsp;</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/455/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/455/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=455&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2013/01/25/slides-and-replay-of-my-using-r-with-hadoop-webinar-now-available-rstats-hadoop/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>
	</item>
		<item>
		<title>How to specify a MapR distro when launching Elastic MapReduce clusters with the Ruby CLI</title>
		<link>http://jeffreybreen.wordpress.com/2013/01/09/emr-map/</link>
		<comments>http://jeffreybreen.wordpress.com/2013/01/09/emr-map/#comments</comments>
		<pubDate>Wed, 09 Jan 2013 18:17:31 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Tips]]></category>
		<category><![CDATA[AWS]]></category>
		<category><![CDATA[cloud computing]]></category>
		<category><![CDATA[Elastic MapReduce]]></category>
		<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[MapR]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=453</guid>
		<description><![CDATA[Amazon&#8217;s Elastic MapReduce Ruby client allows you to specify which of the supported Hadoop distributions to use, currently either Amazon&#8217;s Apache 1.0.3-based distribution or MapR&#8217;s M3 and M5 editions. I found the CLI&#8217;s option documented at &#60;http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-mapr.html&#62;: To launch an Amazon EMR job flow with MapR using the CLI Set the --with-supported-products parameter to either mapr-m3 or [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=453&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Amazon&#8217;s <a href="http://aws.amazon.com/developertools/2264">Elastic MapReduce Ruby client</a> allows you to specify which of the supported Hadoop distributions to use, currently either Amazon&#8217;s Apache 1.0.3-based distribution or MapR&#8217;s M3 and M5 editions.</p>
<p>I found the CLI&#8217;s option documented at &lt;<a href="http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-mapr.html">http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-mapr.html</a>&gt;:</p>
<p style="padding-left:30px;">To launch an Amazon EMR job flow with MapR using the CLI</p>
<p style="padding-left:30px;">Set the --with-supported-products parameter to either <strong>mapr-m3</strong> or <strong>mapr-m5</strong> to run your job flow on the corresponding version of the MapR Hadoop distribution.</p>
<p style="padding-left:30px;">The following example launches a job flow running with the M3 Edition of MapR.</p>
<p style="padding-left:30px;">elastic-mapreduce --create --alive \<br />
--instance-type m1.xlarge --num-instances 5 \<br />
--with-supported-products mapr-m3</p>
<p style="padding-left:30px;">For additional information about launching job flows using the CLI, see the instructions for each job flow type in Create a Job Flow.</p>
<p>&nbsp;</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/453/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/453/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=453&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2013/01/09/emr-map/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>
	</item>
		<item>
		<title>Slides from &#8220;Tapping the Data Deluge with R&#8221; lightning talk #rstats #PAWCon</title>
		<link>http://jeffreybreen.wordpress.com/2012/10/02/tapping-the-data-deluge-with-r/</link>
		<comments>http://jeffreybreen.wordpress.com/2012/10/02/tapping-the-data-deluge-with-r/#comments</comments>
		<pubDate>Tue, 02 Oct 2012 15:56:12 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[#PAWcon]]></category>
		<category><![CDATA[data]]></category>
		<category><![CDATA[econometrics]]></category>
		<category><![CDATA[EPA]]></category>
		<category><![CDATA[Federal Reserve]]></category>
		<category><![CDATA[FRED]]></category>
		<category><![CDATA[quantitative finance]]></category>
		<category><![CDATA[quantmod]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[rstats]]></category>
		<category><![CDATA[WDI]]></category>
		<category><![CDATA[World Bank]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=441</guid>
		<description><![CDATA[Here is my presentation from last night&#8217;s Boston Predictive Analytics Meetup graciously hosted by Predictive Analytics World Boston. The talk is meant to provide an overview of (some) of the different ways to get data into R, especially supplementary data sets to assist with your analysis. All code and data files are available at github: [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=441&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Here is my presentation from last night&#8217;s <a href="http://www.meetup.com/Boston-Predictive-Analytics/events/81297972/">Boston Predictive Analytics Meetup</a> graciously hosted by Predictive Analytics World Boston.</p>
<p>The talk is meant to provide an overview of (some of) the different ways to get data into R, especially supplementary data sets to assist with your analysis.</p>
<p>All code and data files are available at github: <a href="http://bit.ly/pawdata" rel="nofollow">http://bit.ly/pawdata</a> (<a href="https://github.com/jeffreybreen/talk-201210-data-deluge" rel="nofollow">https://github.com/jeffreybreen/talk-201210-data-deluge</a>)</p>
<p>The slides themselves are on slideshare: <a href="http://bit.ly/pawdatadeck" rel="nofollow">http://bit.ly/pawdatadeck</a> (<a href="http://www.slideshare.net/jeffreybreen/tapping-the-data-deluge-with-r" rel="nofollow">http://www.slideshare.net/jeffreybreen/tapping-the-data-deluge-with-r</a>)</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/14558274' width='780' height='639'></iframe>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/441/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/441/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=441&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2012/10/02/tapping-the-data-deluge-with-r/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>
	</item>
		<item>
		<title>Slides from today&#8217;s Big Data Step-by-Step Tutorials: Infrastructure series and Intro to R+Hadoop with RHadoop&#8217;s rmr</title>
		<link>http://jeffreybreen.wordpress.com/2012/03/10/big-data-step-by-step-slides/</link>
		<comments>http://jeffreybreen.wordpress.com/2012/03/10/big-data-step-by-step-slides/#comments</comments>
		<pubDate>Sat, 10 Mar 2012 17:13:42 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[airlines]]></category>
		<category><![CDATA[Amazon EC2]]></category>
		<category><![CDATA[Big Data]]></category>
		<category><![CDATA[cloud computing]]></category>
		<category><![CDATA[Cloudera]]></category>
		<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[rstats]]></category>
		<category><![CDATA[VMware]]></category>
		<category><![CDATA[Whirr]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=425</guid>
		<description><![CDATA[Slides from the Boston Predictive Analytics Big Data Workshop tutorials:

Big Data Step-by-Step: Infrastructure 1/3: Local VM
Big Data Step-by-Step: Infrastructure 2/3: Running R and RStudio on EC2
Big Data Step-by-Step: Infrastructure 3/3: Taking it to the cloud... easily... with Whirr
Big Data Step-by-Step: Using R &#38; Hadoop (with RHadoop's rmr package)
<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=425&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Here are my presentations from today&#8217;s <a href="http://www.meetup.com/Boston-Predictive-Analytics/events/47977712/">Boston Predictive Analytics Big Data Workshop</a>.</p>
<p>All code and config files are available at github: <a href="https://github.com/jeffreybreen/tutorial-201203-big-data">https://github.com/jeffreybreen/tutorial-201203-big-data</a></p>
<p>My portion of the workshop was divided into four parts, three focusing on different infrastructure scenarios and ending with a deep dive into the rmr R package:</p>
<h1>Big Data Step-by-Step: Infrastructure 1/3: Local VM</h1>
<p><strong>Starting small.</strong> Just because Big Data tools like Hadoop were designed to run at &#8220;web-scale,&#8221; across many nodes, doesn&#8217;t mean you need to build a cluster&mdash;or even dedicate a single machine&mdash;to get started. In this deck we download and install a virtual machine from Cloudera which comes complete with a functioning, single-node Hadoop installation. As long as you restrict the size of your data set appropriately, this is a great way to become accustomed to Hadoop and its tools. We walk through running a Hadoop Streaming job to make sure everything works. We later use this same VM to spawn a Hadoop cluster in the cloud (see part 3).</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/11951017' width='780' height='639'></iframe>
<p> </p>
<hr />
<p> </p>
<h1>Big Data Step-by-Step: Infrastructure 2/3: Running R and RStudio on EC2</h1>
<p><strong>Not everyone has Big Data.</strong> Some of us have an occasional need to analyze a data set larger than comfortably fits in our existing analysis environment either due to disk, CPU, or memory constraints. For these times, launching a single, large machine in the cloud may fit the bill. This part of the presentation walks through how to launch just such a machine using Amazon&#8217;s EC2 cloud computing platform. Since I tend to run R and RStudio on Linux, that&#8217;s the focus of this tutorial, but the general outline may be helpful to others as well.</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/11951082' width='780' height='639'></iframe>
<p> </p>
<hr />
<p> </p>
<h1>Big Data Step-by-Step: Infrastructure 3/3: Taking it to the cloud&#8230; easily&#8230; with Whirr</h1>
<p><strong>Scale up using the cloud.</strong> The Apache Whirr cloud management tool makes it easy to launch a Hadoop cluster on EC2. We use the Cloudera VM from presentation #1 as a launching point for the cluster and, thanks to a Whirr-generated proxy script, submit jobs and fetch results from our local VM just as before. For extra credit, we see how Whirr can save us money by bidding for excess capacity via EC2&#8217;s spot instances.</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/11951147' width='780' height='639'></iframe>
<p> </p>
<hr />
<p> </p>
<h1>Big Data Step-by-Step: Using R &amp; Hadoop (with RHadoop&#8217;s rmr package)</h1>
<p><strong>Crunching Big Data with R.</strong> Originally a Java-only ecosystem, Hadoop Streaming allows the creation of mappers, reducers, and combiners in any language which can handle stdin and stdout&mdash;but that doesn&#8217;t mean you want to have to write code to manage I/O at that level. After a quick (and undoubtedly incomplete) survey of Hadoop-related R packages, we walk through some of the abstractions and features of RHadoop&#8217;s rmr package which make it easier for R developers to get started. We walk through a sample mapper and reducer, demonstrating and documenting the native R objects which carry the data from step to step.</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/11951888' width='780' height='639'></iframe>
<p> </p>
<hr />
<p> </p>
<p>Thank you to the session&#8217;s sponsors, all the speakers, and to an interesting and engaged audience. Special thanks to John Versotek for arranging such an informative and enjoyable day, and for the opportunity to take part.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/425/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/425/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=425&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2012/03/10/big-data-step-by-step-slides/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>
	</item>
		<item>
		<title>Use geom_rect() to add recession bars to your time series plots #rstats #ggplot</title>
		<link>http://jeffreybreen.wordpress.com/2011/08/15/recession-bars/</link>
		<comments>http://jeffreybreen.wordpress.com/2011/08/15/recession-bars/#comments</comments>
		<pubDate>Tue, 16 Aug 2011 03:50:56 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Tips]]></category>
		<category><![CDATA[Federal Reserve]]></category>
		<category><![CDATA[FRED]]></category>
		<category><![CDATA[ggplot2]]></category>
		<category><![CDATA[quantmod]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[visualization]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=411</guid>
		<description><![CDATA[ggplot2's geom_rect() layer makes it easy to highlight portions of your graph, such as recessions on an economic time series.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=411&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p><a href="http://moderntoolmaking.blogspot.com/2011/08/forecasting-recessions.html">Zach Mayer&#8217;s work reproducing John Hussman&#8217;s Recession Warning Composite</a> prompted me to dig this trick out of my (Evernote) notebook.</p>
<p>First, let&#8217;s grab some data to plot using the very handy <code>getSymbols()</code> function from <a href="http://www.quantmod.com/">Jeffrey Ryan&#8217;s quantmod package</a>. We&#8217;ll load the U.S. unemployment rate (<code>UNRATE</code>) from the St. Louis Fed&#8217;s Federal Reserve Economic Data (<code>src="FRED"</code>) and load the time series into a <code>data.frame</code>:</p>
<p><pre class="brush: r; light: true;">
unrate = getSymbols('UNRATE',src='FRED', auto.assign=F) 
unrate.df = data.frame(date=time(unrate), coredata(unrate) )
</pre></p>
<p>Now FRED provides a <code>USREC</code> time series which we could use to draw the recessions. It&#8217;s a bit awkward, though, as it contains a boolean to flag recession months since January 1921. All we really want are the start and end dates of each recession. Fortunately, the St. Louis Fed publishes just such a table on their web site. (See the answer to &#8220;What dates are used for the US recession bars in FRED graphs?&#8221; on <a href="http://research.stlouisfed.org/fred2/help-faq/">http://research.stlouisfed.org/fred2/help-faq/</a>.) Sometimes it&#8217;s still easier to cut-and-paste (and the static table covers another 64 years, go figure):</p>
<p><pre class="brush: r; light: true;">
recessions.df = read.table(textConnection(
&quot;Peak, Trough
1857-06-01, 1858-12-01
1860-10-01, 1861-06-01
1865-04-01, 1867-12-01
1869-06-01, 1870-12-01
1873-10-01, 1879-03-01
1882-03-01, 1885-05-01
1887-03-01, 1888-04-01
1890-07-01, 1891-05-01
1893-01-01, 1894-06-01
1895-12-01, 1897-06-01
1899-06-01, 1900-12-01
1902-09-01, 1904-08-01
1907-05-01, 1908-06-01
1910-01-01, 1912-01-01
1913-01-01, 1914-12-01
1918-08-01, 1919-03-01
1920-01-01, 1921-07-01
1923-05-01, 1924-07-01
1926-10-01, 1927-11-01
1929-08-01, 1933-03-01
1937-05-01, 1938-06-01
1945-02-01, 1945-10-01
1948-11-01, 1949-10-01
1953-07-01, 1954-05-01
1957-08-01, 1958-04-01
1960-04-01, 1961-02-01
1969-12-01, 1970-11-01
1973-11-01, 1975-03-01
1980-01-01, 1980-07-01
1981-07-01, 1982-11-01
1990-07-01, 1991-03-01
2001-03-01, 2001-11-01
2007-12-01, 2009-06-01&quot;), sep=',',
colClasses=c('Date', 'Date'), header=TRUE)
</pre></p>
<p>Now the only &#8220;gotcha&#8221; is that our recession data start long before our unemployment data, so let&#8217;s trim it to match:</p>
<p><pre class="brush: r; light: true;">
recessions.trim = subset(recessions.df, Peak &gt;= min(unrate.df$date) )
</pre></p>
<p>Finally, we use ggplot2&#8217;s <code>geom_line()</code> layer to draw the unemployment data and transparent (<code>alpha=0.2</code>) pink rectangles to overlay the recessions:</p>
<p><pre class="brush: r; light: true;">
g = ggplot(unrate.df) + geom_line(aes(x=date, y=UNRATE)) + theme_bw()
g = g + geom_rect(data=recessions.trim, aes(xmin=Peak, xmax=Trough, ymin=-Inf, ymax=+Inf), fill='pink', alpha=0.2)
</pre></p>
<p><a href="http://jeffreybreen.files.wordpress.com/2011/08/rplot.png"><img src="http://jeffreybreen.files.wordpress.com/2011/08/rplot.png?w=600&#038;h=270" alt="" title="recession bars" width="600" height="270" /></a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/411/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/411/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=411&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2011/08/15/recession-bars/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>

		<media:content url="http://jeffreybreen.files.wordpress.com/2011/08/rplot.png?w=600" medium="image">
			<media:title type="html">recession bars</media:title>
		</media:content>
	</item>
		<item>
		<title>One-liners which make me love R: twitteR&#8217;s searchTwitter() #rstats</title>
		<link>http://jeffreybreen.wordpress.com/2011/07/21/one-liners-twitter/</link>
		<comments>http://jeffreybreen.wordpress.com/2011/07/21/one-liners-twitter/#comments</comments>
		<pubDate>Thu, 21 Jul 2011 15:00:26 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[One-liners]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[twitter]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=320</guid>
		<description><![CDATA[R reminds me a lot of English. It&#8217;s easy to get started, but very difficult to master. So for all those times I&#8217;ve spent&#8230; well, forever&#8230; trying to figure out the &#8220;R way&#8221; of doing something, I&#8217;m glad to share these quick wins. My recent R tutorial on mining Twitter for consumer sentiment wouldn&#8217;t have [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=320&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>R reminds me a lot of English. It&#8217;s easy to get started, but very difficult to master. So for all those times I&#8217;ve spent&#8230; well, forever&#8230; trying to figure out the &#8220;R way&#8221; of doing something, I&#8217;m glad to share these quick wins.</p>
<p>My recent <a href="http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/">R tutorial on mining Twitter for consumer sentiment</a> wouldn&#8217;t have been possible without Jeff Gentry&#8217;s amazing twitteR package (<a href="http://cran.r-project.org/web/packages/twitteR/">available on CRAN</a>). It does so much of the behind-the-scenes heavy lifting to access Twitter&#8217;s REST APIs, that one line of code is all you need to perform a search and retrieve the (even paginated) results:</p>
<p><pre class="brush: r; highlight: [3]; light: true;">
library(twitteR)

tweets = searchTwitter(&quot;#rstats&quot;, n=1500)
</pre></p>
<p>You can search for anything, of course, &#8220;#rstats&#8221; is just an example. (And if you&#8217;re really into that hashtag, the twitteR package even provides an <code>Rtweets()</code> function which hardcodes that search string for you.) The <code>n=1500</code> specifies the maximum number of tweets supported by the Search API, though you may retrieve fewer as Twitter&#8217;s search indices contain only a couple of days&#8217; tweets.</p>
<p>What you get back is a list of tweets (technically &#8220;status updates&#8221;):</p>
<p><pre class="brush: r; highlight: [1,20]; light: true;">
&gt; head(tweets)
[[1]]
[1] &quot;Cloudnumberscom: CloudNumbers.com &#092;&#048;23 #Rstats gets real in the cloud http://t.co/Vw4Gupr via @AddToAny&quot;

[[2]]
[1] &quot;0_h_r_1: CloudNumbers.com &#092;&#048;23 #Rstats gets real in the cloud via DecisionStats - I came across Cloudnumbers.com . ... http://tinyurl.com/5sjagjg&quot;

[[3]]
[1] &quot;cmprsk: RT I just joined the beta to run #Rstats in the cloud with cloudnumbers.com http://t.co/lvVp0YJ via @cloudnumberscom http://bit.ly/lbSruR&quot;

[[4]]
[1] &quot;0_h_r_1: I just joined the beta to run #Rstats in the cloud with cloudnumbers.com http://t.co/lvVp0YJ via @cloudnumberscom&quot;

[[5]]
[1] &quot;cmprsk: RT man, the #rstats think people I am too soft on #sas, the #sas people think I am too soft on #wps, the #wps pe... http://bit.ly/innEv8&quot;

[[6]]
[1] &quot;keepstherainoff: Thanks to @cmprsk @geoffjentry and @MikeKSmith for colour-coded #Rstats GUI advice&quot;

&gt; class(tweets[[1]])
[1] &quot;status&quot;
attr(,&quot;package&quot;)
[1] &quot;twitteR&quot;
</pre></p>
<p>Now that you have some tweets, <a href="http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/">the fun really begins</a>. To get you started, the <code>status</code> class includes a very handy <code>toDataFrame()</code> accessor method (see <code>?status</code>):</p>
<p><pre class="brush: r; light: true;">
&gt; library(plyr) 
&gt; tweets.df = ldply(tweets, function(t) t$toDataFrame() )
</pre><br />
<img src="http://jeffreybreen.files.wordpress.com/2011/07/tweets-df.png?w=780" alt="" title="tweets.df"   class="alignnone size-full wp-image-396" /><br />
<pre class="brush: r; highlight: [1]; light: true;">
&gt; str(tweets.df)
'data.frame':	131 obs. of  10 variables:
 $ text        : Factor w/ 122 levels &quot;CloudNumbers.com &#092;&#048;23 #Rstats gets real in the cloud http://t.co/Vw4Gupr via @AddToAny&quot;,..: 1 2 3 4 5 6 7 8 9 10 ...
 $ favorited   : logi  NA NA NA NA NA NA ...
 $ replyToSN   : logi  NA NA NA NA NA NA ...
 $ created     : POSIXct, format: &quot;2011-07-04 13:50:39&quot; &quot;2011-07-04 13:48:10&quot; &quot;2011-07-04 13:29:00&quot; &quot;2011-07-04 13:23:42&quot; ...
 $ truncated   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ replyToSID  : logi  NA NA NA NA NA NA ...
 $ id          : Factor w/ 131 levels &quot;87941406873751552&quot;,..: 1 2 3 4 5 6 7 8 9 10 ...
 $ replyToUID  : logi  NA NA NA NA NA NA ...
 $ statusSource: Factor w/ 17 levels &quot;&amp;lt;a href=&amp;quot;http://twitter.com/tweetbutton&amp;quot; rel=&amp;quot;nofollow&amp;quot;&amp;gt;Tweet Button&amp;lt;/a&amp;gt;&quot;,..: 1 2 3 1 3 4 5 5 3 4 ...
 $ screenName  : Factor w/ 64 levels &quot;Cloudnumberscom&quot;,..: 1 2 3 2 3 4 2 5 3 6 ...
</pre></p>
<p>You can pull a particular user&#8217;s tweets just as easily with the <code>userTimeline()</code> function. Heck, the package even lets you tweet from R if you use <a href="http://cran.r-project.org/web/packages/ROAuth/">Jeff&#8217;s companion ROAuth package</a>, but that requires more than one line&#8230;.</p>
<p>Enjoy!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/320/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/320/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=320&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2011/07/21/one-liners-twitter/feed/</wfw:commentRss>
		<slash:comments>6</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>

		<media:content url="http://jeffreybreen.files.wordpress.com/2011/07/tweets-df.png" medium="image">
			<media:title type="html">tweets.df</media:title>
		</media:content>
	</item>
		<item>
		<title>Use Dropbox&#8217;s public folder for web publishing via Notepad (or emacs or&#8230;)</title>
		<link>http://jeffreybreen.wordpress.com/2011/07/19/web-publishing-via-dropbox/</link>
		<comments>http://jeffreybreen.wordpress.com/2011/07/19/web-publishing-via-dropbox/#comments</comments>
		<pubDate>Tue, 19 Jul 2011 16:12:39 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Tips]]></category>
		<category><![CDATA[Dropbox]]></category>
		<category><![CDATA[web development]]></category>
		<category><![CDATA[WordPress]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=342</guid>
		<description><![CDATA[Remember The Good Old Days when all you needed to host a web site was a file system and Notepad (or emacs or TeachText)? Well, I do, and I can&#8217;t say that I miss them&#8230; until last week when I tried to insert the JavaScript for some motion charts into a WordPress.com post. It&#8217;s impossible. [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=342&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Remember The Good Old Days when all you needed to host a web site was a file system and Notepad (or emacs or TeachText)? </p>
<p>Well, I do, and I can&#8217;t say that I miss them&#8230; until last week when I tried to insert the <a href="http://jeffreybreen.wordpress.com/2011/07/14/r-one-liners-googlevis/">JavaScript for some motion charts</a> into a WordPress.com post. It&#8217;s impossible. Literally. Don&#8217;t waste your time. Seriously.</p>
<p>Self-hosted WordPress blogs can use some <a href="http://wordpress.org/extend/plugins/custom-fields-shortcode/">custom field hackery</a>, but there&#8217;s no such option for us easy-way-out WordPress.com users.</p>
<h2>Dropbox to the rescue</h2>
<p>Just save your HTML page to your &#8220;Public&#8221; directory in Dropbox and it will get its own public URL which you can find in Dropbox&#8217;s context menu:</p>
<p><img src="http://jeffreybreen.files.wordpress.com/2011/07/dropbox-copy-public-link.png?w=780&#038;h=209" alt="" title="Dropbox &gt; Copy Public Link" width="780" height="209" class="alignnone size-full wp-image-381" /></p>
<p>It&#8217;s not the ideal embedding I was hoping for &#8212; WordPress.com even strips out iframes &#8212; but it&#8217;s quick and easy and does the job.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/342/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/342/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=342&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2011/07/19/web-publishing-via-dropbox/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>

		<media:content url="http://jeffreybreen.files.wordpress.com/2011/07/dropbox-copy-public-link.png" medium="image">
			<media:title type="html">Dropbox &#62; Copy Public Link</media:title>
		</media:content>
	</item>
		<item>
		<title>One-liners which make me love R: Make your data dance (Hans Rosling style) with googleVis #rstats</title>
		<link>http://jeffreybreen.wordpress.com/2011/07/14/r-one-liners-googlevis/</link>
		<comments>http://jeffreybreen.wordpress.com/2011/07/14/r-one-liners-googlevis/#comments</comments>
		<pubDate>Fri, 15 Jul 2011 04:42:46 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[One-liners]]></category>
		<category><![CDATA[Google Visualization]]></category>
		<category><![CDATA[googleVis]]></category>
		<category><![CDATA[lazy]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[visualization]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=331</guid>
		<description><![CDATA[This inaugural post in my "one-liners which make me love R" series highlights the googleVis package which makes it easy to use the Google Visualization API from R. Thanks to googleVis, just one line of R generates the 165 lines of HTML and (mostly) JavaScript required to create a Hans Rosling-style motion chart for some sample data.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=331&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>It may be a cliché, but much of R&#8217;s utility comes from its amazing community. And by community, I am specifically referring to the bright, hard-working people who are willing to share their knowledge and code with the rest of us. Because of their contributions, we can do some amazingly cool and useful things with very little code of our own. It is in this context that I launch this new series to highlight packages and functions which make it easy to do jaw-droppingly cool and useful things.</p>
<p>First up: the <a href="http://cran.r-project.org/web/packages/googleVis/">googleVis package</a> by Markus Gesmann and Diego de Castillo, which makes it easy &#8212; often with just one line of R &#8212; to harness the <a href="http://code.google.com/apis/chart/">Google Visualization API</a>. Annotated timelines, gauges, maps, org charts, tree maps, and more are suddenly at your command.</p>
<p>I&#8217;m going to focus on the motion chart, popularized by Hans Rosling in his <a href="http://www.ted.com/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen.html">groundbreaking 2006 TED talk on global economic development</a>. (If you haven&#8217;t seen it yet, you should. Right now. Seriously. <a href="http://www.ted.com/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen.html">Go.</a>) Motion charts are an innovative, interactive way to display multidimensional time series. And the googleVis package even comes with some sample data to make it easier to try them out.</p>
<p>The package is available from CRAN if you need to install it.</p>
<p>To get started, load the package and the included &#8220;Fruits&#8221; <code>data.frame</code>:</p>
<p><pre class="brush: r; light: true;">
library(googleVis)
data(Fruits)
</pre></p>
<p>This <code>data.frame</code> contains some sample data about sales of various fruits at different locations for different years. There&#8217;s even a proper <code>Date</code> column already constructed for us from the <code>numeric</code> Year column:</p>
<p><img src="http://jeffreybreen.files.wordpress.com/2011/07/fruits.png?w=780" alt="" title="Fruits"   class="alignnone size-full wp-image-368" /></p>
<p>To make the chart, we need to give the <code>gvisMotionChart()</code> function our <code>data.frame</code> and tell it a few things about it: the column which identifies the items to examine (<code>idvar="Fruit"</code>), the time dimension (<code>timevar="Date"</code>), and optionally a name to use to identify the chart in the generated HTML and JavaScript (we&#8217;ll use <code>chartid="ILoveFruit"</code>):</p>
<p><pre class="brush: r; light: true;">
M = gvisMotionChart(data=Fruits, idvar=&quot;Fruit&quot;, timevar=&quot;Date&quot;, chartid=&quot;ILoveFruit&quot;)
</pre></p>
<p>That&#8217;s it.</p>
<p>You can view your chart with the overridden <code>plot()</code> function. It will automatically spawn a browser window and serve up your chart through R&#8217;s internal web server:</p>
<p><pre class="brush: r; light: true;">
plot(M)
</pre></p>
<p><em>Since WordPress doesn&#8217;t allow embedded JavaScript, <a href="http://dl.dropbox.com/u/4839225/www/motioncharts/ILoveFruits.html">please click through to see the motion chart in action</a>:</em><br />
<a href="http://dl.dropbox.com/u/4839225/www/motioncharts/ILoveFruits.html"><img src="http://jeffreybreen.files.wordpress.com/2011/07/ilovefruits.png?w=518&#038;h=483" alt="" title="ILoveFruits" width="518" height="483" class="alignnone" /></a></p>
<p>You can also access all 165 lines of the generated HTML and JavaScript and save it to disk:</p>
<p><pre class="brush: r; light: true;">
cat(unlist(M$html), file=&quot;output/ILoveFruits.html&quot;)
</pre></p>
<p>Time suck alert: googleVis may make them easy to create, but motion charts can be a lot of fun to play with. You have been warned&#8230;</p>
<p>If you want to take a look at an example with some real data, you might be interested in the <a href="http://www.cambridge.aero/_blog/main/post/US_Domestic_Airline_Market_In_Motion_1990-2010/">20 Years of the U.S. Domestic Airline Market In 20 seconds</a> post on my work blog.</p>
<p>Finally, here are the slides from my lightning talk on this topic at this month&#8217;s Greater Boston useR Group meeting:</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/8590025' width='780' height='639'></iframe>
<p>Have fun!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/331/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/331/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=331&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2011/07/14/r-one-liners-googlevis/feed/</wfw:commentRss>
		<slash:comments>9</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>

		<media:content url="http://jeffreybreen.files.wordpress.com/2011/07/fruits.png" medium="image">
			<media:title type="html">Fruits</media:title>
		</media:content>

		<media:content url="http://jeffreybreen.files.wordpress.com/2011/07/ilovefruits.png?w=300" medium="image">
			<media:title type="html">ILoveFruits</media:title>
		</media:content>
	</item>
		<item>
		<title>installing R 2.13.1 on Amazon EC2&#8217;s &#8220;Amazon Linux&#8221; AMI #rstats</title>
		<link>http://jeffreybreen.wordpress.com/2011/07/08/install-r-amazon-linux-ec2/</link>
		<comments>http://jeffreybreen.wordpress.com/2011/07/08/install-r-amazon-linux-ec2/#comments</comments>
		<pubDate>Fri, 08 Jul 2011 18:27:32 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Sysadmin]]></category>
		<category><![CDATA[cloud computing]]></category>
		<category><![CDATA[EC2]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[server]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=324</guid>
		<description><![CDATA[Condensed from this post (and comments) on David Chudzicki&#8217;s blog, tweaked, and updated for R-2.13.1. Assumes you&#8217;re starting with a virgin &#8220;Amazon Linux&#8221; AMI. I picked &#8220;Basic 64-bit Amazon Linux AMI 2011.02.1 Beta&#8221; (AMI Id: ami-8e1fece7) because it was marked as free tier eligible on the &#8220;Quick Start&#8221; tab of AWS&#8217;s &#8220;Launch Instance&#8221; dialog box: [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=324&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Condensed from <a href="http://blog.davidchudzicki.com/2011/02/installing-r-on-amazon-linux.html">this post (and comments) on David Chudzicki&#8217;s blog</a>, tweaked, and updated for R-2.13.1. </p>
<p>Assumes you&#8217;re starting with a virgin &#8220;Amazon Linux&#8221; AMI. I picked &#8220;Basic 64-bit Amazon Linux AMI 2011.02.1 Beta&#8221; (AMI Id: ami-8e1fece7) because it was marked as free tier eligible on the &#8220;Quick Start&#8221; tab of AWS&#8217;s &#8220;Launch Instance&#8221; dialog box:</p>
<p><pre class="brush: plain; light: true;">
$ sudo yum -y install make libX11-devel.* libICE-devel.* libSM-devel.* libdmx-devel.* libx* xorg-x11* libFS* libX*  readline-devel gcc-gfortran gcc-c++ texinfo tetex

$ wget http://cran.r-project.org/src/base/R-2/R-2.13.1.tar.gz

$ tar zxf R-2.13.1.tar.gz &amp;&amp; cd R-2.13.1
$ ./configure &amp;&amp; make

$ # make coffee... or finish your PhD thesis... (yes, it takes that long)
[...]
$ # finally, if all is well:

$ sudo make install

$ cd
$ R --version
R version 2.13.1 (2011-07-08)
Copyright (C) 2011 The R Foundation for Statistical Computing
ISBN 3-900051-07-0
Platform: x86_64-unknown-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under the terms of the
GNU General Public License version 2.
For more information about these matters see
http://www.gnu.org/licenses/.
</pre></p>
<p>As always, refer to the <a href="http://cran.r-project.org/doc/manuals/R-admin.pdf">Installation and Administration manual</a> for details and options.</p>
<p>If you want to install RCurl, or anything which depends on it like twitteR, you&#8217;ll need to install libcurl &amp; friends first:</p>
<p><pre class="brush: plain; light: true;">
$ sudo yum -y install libcurl libcurl-devel
</pre></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/324/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=324&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2011/07/08/install-r-amazon-linux-ec2/feed/</wfw:commentRss>
		<slash:comments>28</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>
	</item>
		<item>
		<title>slides from my R tutorial on Twitter text mining #rstats</title>
		<link>http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/</link>
		<comments>http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/#comments</comments>
		<pubDate>Mon, 04 Jul 2011 18:56:31 +0000</pubDate>
		<dc:creator>Jeffrey Breen</dc:creator>
				<category><![CDATA[Tutorials]]></category>
		<category><![CDATA[airlines]]></category>
		<category><![CDATA[Boston Predictive Analytics]]></category>
		<category><![CDATA[doBy]]></category>
		<category><![CDATA[ggplot2]]></category>
		<category><![CDATA[Hu & Liu]]></category>
		<category><![CDATA[plyr]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[sentiment analysis]]></category>
		<category><![CDATA[text mining]]></category>
		<category><![CDATA[tm]]></category>

		<guid isPermaLink="false">http://jeffreybreen.wordpress.com/?p=311</guid>
		<description><![CDATA[Update: An expanded version of this tutorial will appear in the new Elsevier book Practical Text Mining and Statistical Analysis for Non-structured Text Data Applications by Gary Miner et al., which is now available for pre-order from Amazon. In conjunction with the book, I have cleaned up the tutorial code and published it on github. [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=311&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p><b>Update:</b> An expanded version of this tutorial will appear in the new Elsevier book <a href="http://ow.ly/63NWl">Practical Text Mining and Statistical Analysis for Non-structured Text Data Applications</a> by Gary Miner <i>et al.</i>, which is now <a href="http://ow.ly/63NWl">available for pre-order from Amazon</a>.</p>
<p>In conjunction with the book, I have cleaned up the tutorial code and <a href="https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107">published it on github.</a></p>
<hr />
<p>Last month I presented this introduction to R at the <a href="http://www.meetup.com/Boston-Predictive-Analytics/events/17462846/">Boston Predictive Analytics MeetUp</a> on Twitter Sentiment. </p>
<p>The goal of the presentation was to expose a first-time (but technically savvy) audience to working in R. The scenario we work through is to estimate the sentiment expressed in tweets about major U.S. airlines. Even with a tiny sample and a very crude algorithm (simply counting the number of positive vs. negative words), we find a believable result. We conclude by comparing our result with scores we scrape from the <a href="http://www.theacsi.org/">American Consumer Satisfaction Index web site</a>.</p>
<p>Jeff Gentry&#8217;s twitteR package makes it easy to fetch the tweets. Also featured are the plyr, ggplot2, doBy, and XML packages. A real analysis would, no doubt, lean heavily on the tm text mining package for stemming, etc.</p>
<p>Here is the slimmed-down version of the slides:</p>
<iframe src='http://www.slideshare.net/slideshow/embed_code/8504941' width='780' height='639'></iframe>
<p>And here&#8217;s a <a href="http://ow.ly/5Bn6K">PDF version</a> to download.</p>
<p>Special thanks to John Verostek for putting together such an interesting event, and for providing valuable feedback and help with these slides.</p>
<hr />
<p><strong>Update:</strong> thanks to eagle-eyed <a href="http://twitter.com/cdhowe">Carl Howe</a> for noticing a slightly out-of-date version of the <code>score.sentiment()</code> function in the deck. Missing was handling for <code>NA</code> values from <code>match()</code>. The deck has been updated and the code is reproduced here for convenience:</p>
<p><pre class="brush: r; light: true;">

score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
	require(plyr)
	require(stringr)
	
	# we got a vector of sentences. plyr will handle a list
	# or a vector as an &quot;l&quot; for us
	# we want a simple array (&quot;a&quot;) of scores back, so we use 
	# &quot;l&quot; + &quot;a&quot; + &quot;ply&quot; = &quot;laply&quot;:
	scores = laply(sentences, function(sentence, pos.words, neg.words) {
		
		# clean up sentences with R's regex-driven global substitute, gsub():
		sentence = gsub('[[:punct:]]', '', sentence)
		sentence = gsub('[[:cntrl:]]', '', sentence)
		sentence = gsub('\\d+', '', sentence)
		# and convert to lower case:
		sentence = tolower(sentence)

		# split into words. str_split is in the stringr package
		word.list = str_split(sentence, '\\s+')
		# sometimes a list() is one level of hierarchy too much
		words = unlist(word.list)

		# compare our words to the dictionaries of positive &amp; negative terms
		pos.matches = match(words, pos.words)
		neg.matches = match(words, neg.words)
	
		# match() returns the position of the matched term or NA
		# we just want a TRUE/FALSE:
		pos.matches = !is.na(pos.matches)
		neg.matches = !is.na(neg.matches)

		# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
		score = sum(pos.matches) - sum(neg.matches)

		return(score)
	}, pos.words, neg.words, .progress=.progress )

	scores.df = data.frame(score=scores, text=sentences)
	return(scores.df)
}
</pre></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/jeffreybreen.wordpress.com/311/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/jeffreybreen.wordpress.com/311/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=jeffreybreen.wordpress.com&#038;blog=1195963&#038;post=311&#038;subd=jeffreybreen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/feed/</wfw:commentRss>
		<slash:comments>111</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/78f86d3c15d7ca35a86e36937b01cc2d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">jeffreybreen</media:title>
		</media:content>
	</item>
	</channel>
</rss>
